Anforderungen  |   Konzepte  |   Entwurf  |   Entwicklung  |   Qualitätssicherung  |   Lebenszyklus  |   Steuerung
 
 
 
 


Quelle  looprestoration_tmpl.S   Sprache: ARM64-Assembler (AArch64 NEON)

 
/*
 * Copyright © 2018, VideoLAN and dav1d authors
 * Copyright © 2018, Martin Storsjo
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */


#include "src/arm/asm.S"

#define FILTER_OUT_STRIDE 384

.macro sgr_funcs bpc
// void dav1d_sgr_finish_filter1_2rows_Xbpc_neon(int16_t *tmp,
//                                               const pixel *src,
//                                               const ptrdiff_t src_stride,
//                                               const int32_t **a,
//                                               const int16_t **b,
//                                               const int w, const int h);
//
// Produces two rows of the SGR filter1 (3x3) output at once.
// For each pixel, the 3x3 window of the 16 bit sums b[] is combined with
// weight 4 (shl #2) for the centre + edge-adjacent taps ("mid") and
// weight 3 for the corner taps -> a; the 32 bit sums a[] are combined
// the same way -> b; then (b - a * src) is rounded-narrowed by 9 bits
// into tmp. Row 0 goes to tmp, row 1 to tmp + FILTER_OUT_STRIDE.
// The [r][c] comments below index the 4x3 window of input rows/columns.
function sgr_finish_filter1_2rows_\bpc\()bpc_neon, export=1
        // v8-v15 have callee-saved low halves per AAPCS64; save d8-d15.
        stp             d8,  d9,  [sp, #-0x40]!
        stp             d10, d11, [sp, #0x10]
        stp             d12, d13, [sp, #0x20]
        stp             d14, d15, [sp, #0x30]

        // x7, x8, x9, x3  = a[0..3] (four rows of 32 bit box sums)
        ldp             x7,  x8,  [x3]
        ldp             x9,  x3,  [x3, #16]
        // x10, x11, x12, x4 = b[0..3] (four rows of 16 bit box sums)
        ldp             x10, x11, [x4]
        ldp             x12, x4,  [x4, #16]

        mov             x13, #FILTER_OUT_STRIDE
        cmp             w6,  #1
        add             x2,  x1,  x2 // src + stride
        csel            x2,  x1,  x2,  le // if (h <= 1) x2 = x1
        add             x13, x0,  x13, lsl #1 // x13 = &tmp[FILTER_OUT_STRIDE]

        movi            v30.8h, #3   // corner weight for the 16 bit sums
        movi            v31.4s, #3   // corner weight for the 32 bit sums
1:
        // Load 16 elements of each b row and 12 of each a row; the loop
        // consumes 8 output pixels per iteration and shifts the leftover
        // vectors down before reloading (see the code after "b.le 3f").
        ld1             {v0.8h, v1.8h}, [x10], #32
        ld1             {v2.8h, v3.8h}, [x11], #32
        ld1             {v4.8h, v5.8h}, [x12], #32
        ld1             {v6.8h, v7.8h}, [x4],  #32
        ld1             {v16.4s, v17.4s, v18.4s}, [x7], #48
        ld1             {v19.4s, v20.4s, v21.4s}, [x8], #48
        ld1             {v22.4s, v23.4s, v24.4s}, [x9], #48
        ld1             {v25.4s, v26.4s, v27.4s}, [x3], #48

2:
        // 16 bit sums, rows 0-2 (output row 0): shift the window with ext.
        ext             v8.16b,  v0.16b,  v1.16b, #2  // [0][1]
        ext             v9.16b,  v2.16b,  v3.16b, #2  // [1][1]
        ext             v10.16b, v4.16b,  v5.16b, #2  // [2][1]
        ext             v11.16b, v0.16b,  v1.16b, #4  // [0][2]
        ext             v12.16b, v2.16b,  v3.16b, #4  // [1][2]
        ext             v13.16b, v4.16b,  v5.16b, #4  // [2][2]

        add             v14.8h,  v2.8h,   v8.8h       // [1][0] + [0][1]
        add             v15.8h,  v9.8h,   v10.8h      // [1][1] + [2][1]

        add             v28.8h,  v0.8h,   v11.8h      // [0][0] + [0][2]
        add             v14.8h,  v14.8h,  v12.8h      // () + [1][2]
        add             v29.8h,  v4.8h,   v13.8h      // [2][0] + [2][2]

        // 16 bit sums, rows 1-3 (output row 1).
        ext             v8.16b,  v6.16b,  v7.16b, #2  // [3][1]
        ext             v11.16b, v6.16b,  v7.16b, #4  // [3][2]

        add             v14.8h,  v14.8h,  v15.8h      // mid
        add             v15.8h,  v28.8h,  v29.8h      // corners

        add             v28.8h,  v4.8h,   v9.8h       // [2][0] + [1][1]
        add             v29.8h,  v10.8h,  v8.8h       // [2][1] + [3][1]

        add             v2.8h,   v2.8h,   v12.8h      // [1][0] + [1][2]
        add             v28.8h,  v28.8h,  v13.8h      // () + [2][2]
        add             v4.8h,   v6.8h,   v11.8h      // [3][0] + [3][2]

        add             v0.8h,   v28.8h,  v29.8h      // mid
        add             v2.8h,   v2.8h,   v4.8h       // corners

        // a = 4 * mid + 3 * corners, for both output rows.
        shl             v4.8h,   v14.8h,  #2
        mla             v4.8h,   v15.8h,  v30.8h      // * 3 -> a

        shl             v0.8h,   v0.8h,   #2
        mla             v0.8h,   v2.8h,   v30.8h      // * 3 -> a

        // Same combination for the 32 bit sums (two 4s vectors per step).
        ext             v8.16b,  v16.16b, v17.16b, #4 // [0][1]
        ext             v9.16b,  v17.16b, v18.16b, #4
        ext             v10.16b, v16.16b, v17.16b, #8 // [0][2]
        ext             v11.16b, v17.16b, v18.16b, #8
        ext             v12.16b, v19.16b, v20.16b, #4 // [1][1]
        ext             v13.16b, v20.16b, v21.16b, #4
        add             v8.4s,   v8.4s,   v19.4s      // [0][1] + [1][0]
        add             v9.4s,   v9.4s,   v20.4s
        add             v16.4s,  v16.4s,  v10.4s      // [0][0] + [0][2]
        add             v17.4s,  v17.4s,  v11.4s
        ext             v14.16b, v19.16b, v20.16b, #8 // [1][2]
        ext             v15.16b, v20.16b, v21.16b, #8
        add             v16.4s,  v16.4s,  v22.4s      // () + [2][0]
        add             v17.4s,  v17.4s,  v23.4s
        add             v28.4s,  v12.4s,  v14.4s      // [1][1] + [1][2]
        add             v29.4s,  v13.4s,  v15.4s
        ext             v10.16b, v22.16b, v23.16b, #4 // [2][1]
        ext             v11.16b, v23.16b, v24.16b, #4
        add             v8.4s,   v8.4s,   v28.4s      // mid (incomplete)
        add             v9.4s,   v9.4s,   v29.4s

        add             v19.4s,  v19.4s,  v14.4s      // [1][0] + [1][2]
        add             v20.4s,  v20.4s,  v15.4s
        add             v14.4s,  v22.4s,  v12.4s      // [2][0] + [1][1]
        add             v15.4s,  v23.4s,  v13.4s

        ext             v12.16b, v22.16b, v23.16b, #8 // [2][2]
        ext             v13.16b, v23.16b, v24.16b, #8
        ext             v28.16b, v25.16b, v26.16b, #4 // [3][1]
        ext             v29.16b, v26.16b, v27.16b, #4
        add             v8.4s,   v8.4s,   v10.4s      // () + [2][1] = mid
        add             v9.4s,   v9.4s,   v11.4s
        add             v14.4s,  v14.4s,  v10.4s      // () + [2][1]
        add             v15.4s,  v15.4s,  v11.4s
        ext             v10.16b, v25.16b, v26.16b, #8 // [3][2]
        ext             v11.16b, v26.16b, v27.16b, #8
        add             v16.4s,  v16.4s,  v12.4s      // () + [2][2] = corner
        add             v17.4s,  v17.4s,  v13.4s

        add             v12.4s,  v12.4s,  v28.4s      // [2][2] + [3][1]
        add             v13.4s,  v13.4s,  v29.4s
        add             v25.4s,  v25.4s,  v10.4s      // [3][0] + [3][2]
        add             v26.4s,  v26.4s,  v11.4s

        add             v14.4s,  v14.4s,  v12.4s      // mid
        add             v15.4s,  v15.4s,  v13.4s
        add             v19.4s,  v19.4s,  v25.4s      // corner
        add             v20.4s,  v20.4s,  v26.4s

.if \bpc == 8
        ld1             {v25.8b}, [x1], #8            // src
        ld1             {v26.8b}, [x2], #8
.else
        ld1             {v25.8h}, [x1], #16           // src
        ld1             {v26.8h}, [x2], #16
.endif

        // b = 4 * mid + 3 * corners (32 bit), row 0.
        shl             v8.4s,   v8.4s,   #2
        shl             v9.4s,   v9.4s,   #2
        mla             v8.4s,   v16.4s,  v31.4s      // * 3 -> b
        mla             v9.4s,   v17.4s,  v31.4s

.if \bpc == 8
        uxtl            v25.8h,  v25.8b               // src
        uxtl            v26.8h,  v26.8b
.endif

        // b = 4 * mid + 3 * corners (32 bit), row 1.
        shl             v14.4s,  v14.4s,  #2
        shl             v15.4s,  v15.4s,  #2
        mla             v14.4s,  v19.4s,  v31.4s      // * 3 -> b
        mla             v15.4s,  v20.4s,  v31.4s

        umlsl           v8.4s,   v4.4h,   v25.4h      // b - a * src
        umlsl2          v9.4s,   v4.8h,   v25.8h
        umlsl           v14.4s,  v0.4h,   v26.4h      // b - a * src
        umlsl2          v15.4s,  v0.8h,   v26.8h
        // Shift the remaining input down (the mov interleaving hides
        // latency) and round-narrow the results by 9 bits.
        mov             v0.16b,  v1.16b
        rshrn           v8.4h,   v8.4s,   #9
        rshrn2          v8.8h,   v9.4s,   #9
        mov             v2.16b,  v3.16b
        rshrn           v14.4h,  v14.4s,  #9
        rshrn2          v14.8h,  v15.4s,  #9
        subs            w5,  w5,  #8                  // w -= 8
        mov             v4.16b,  v5.16b
        st1             {v8.8h},  [x0],  #16          // output row 0
        mov             v6.16b,  v7.16b
        st1             {v14.8h}, [x13], #16          // output row 1

        b.le            3f
        mov             v16.16b, v18.16b
        mov             v19.16b, v21.16b
        mov             v22.16b, v24.16b
        mov             v25.16b, v27.16b
        // Refill the upper parts of the shifted-down vectors.
        ld1             {v1.8h}, [x10], #16
        ld1             {v3.8h}, [x11], #16
        ld1             {v5.8h}, [x12], #16
        ld1             {v7.8h}, [x4],  #16
        ld1             {v17.4s, v18.4s}, [x7], #32
        ld1             {v20.4s, v21.4s}, [x8], #32
        ld1             {v23.4s, v24.4s}, [x9], #32
        ld1             {v26.4s, v27.4s}, [x3], #32
        b               2b

3:
        // Restore callee-saved SIMD registers and return.
        ldp             d14, d15, [sp, #0x30]
        ldp             d12, d13, [sp, #0x20]
        ldp             d10, d11, [sp, #0x10]
        ldp             d8,  d9,  [sp], 0x40
        ret
endfunc

// void dav1d_sgr_finish_weighted1_Xbpc_neon(pixel *dst,
//                                           const int32_t **a, const int16_t **b,
//                                           const int w, const int w1,
//                                           const int bitdepth_max);
//
// Fused filter1 + weighting for a single row: computes the 3x3 SGR
// filter1 value t1 (same 4*mid + 3*corners combination as
// sgr_finish_filter1 above, rounded by 9 bits), then applies the
// weight: dst = clip(src + ((t1 * w1) >> 11)), writing back to dst
// in place. Only uses v0-v7/v16-v31, so no callee-saved d8-d15 need
// saving here.
function sgr_finish_weighted1_\bpc\()bpc_neon, export=1
        // x7, x8, x1 = a[0..2]; x9, x10, x2 = b[0..2] (three input rows).
        ldp             x7,  x8,  [x1]
        ldr             x1,       [x1, #16]
        ldp             x9,  x10, [x2]
        ldr             x2,       [x2, #16]

        dup             v31.8h, w4   // w1 (weight)
        dup             v30.8h, w5   // bitdepth_max (16 bpc clamp)

        movi            v6.8h,  #3   // corner weight, 16 bit sums
        movi            v7.4s,  #3   // corner weight, 32 bit sums
1:
        ld1             {v0.8h, v1.8h}, [x9],  #32
        ld1             {v2.8h, v3.8h}, [x10], #32
        ld1             {v4.8h, v5.8h}, [x2],  #32
        ld1             {v16.4s, v17.4s, v18.4s}, [x7], #48
        ld1             {v19.4s, v20.4s, v21.4s}, [x8], #48
        ld1             {v22.4s, v23.4s, v24.4s}, [x1], #48

2:
        // 16 bit sums over the 3x3 window; comments give the tap offset
        // relative to the centre pixel.
        ext             v25.16b, v0.16b,  v1.16b, #2  // -stride
        ext             v26.16b, v2.16b,  v3.16b, #2  // 0
        ext             v27.16b, v4.16b,  v5.16b, #2  // +stride
        ext             v28.16b, v0.16b,  v1.16b, #4  // +1-stride
        ext             v29.16b, v2.16b,  v3.16b, #4  // +1
        add             v2.8h,   v2.8h,   v25.8h      // -1, -stride
        ext             v25.16b, v4.16b,  v5.16b, #4  // +1+stride
        add             v26.8h,  v26.8h,  v27.8h      // 0, +stride
        add             v0.8h,   v0.8h,   v28.8h      // -1-stride, +1-stride
        add             v2.8h,   v2.8h,   v26.8h
        add             v4.8h,   v4.8h,   v25.8h      // -1+stride, +1+stride
        add             v2.8h,   v2.8h,   v29.8h      // +1
        add             v0.8h,   v0.8h,   v4.8h       // corner sum

        ext             v25.16b, v16.16b, v17.16b, #4 // -stride
        ext             v26.16b, v17.16b, v18.16b, #4
        shl             v2.8h,   v2.8h,   #2          // mid * 4
        ext             v27.16b, v16.16b, v17.16b, #8 // +1-stride
        ext             v28.16b, v17.16b, v18.16b, #8
        ext             v29.16b, v19.16b, v20.16b, #4 // 0
        ext             v4.16b,  v20.16b, v21.16b, #4
        mla             v2.8h,   v0.8h,   v6.8h       // * 3 -> a
        // Same combination on the 32 bit sums.
        add             v25.4s,  v25.4s,  v19.4s      // -stride, -1
        add             v26.4s,  v26.4s,  v20.4s
        add             v16.4s,  v16.4s,  v27.4s      // -1-stride, +1-stride
        add             v17.4s,  v17.4s,  v28.4s
        ext             v27.16b, v19.16b, v20.16b, #8 // +1
        ext             v28.16b, v20.16b, v21.16b, #8
        add             v16.4s,  v16.4s,  v22.4s      // -1+stride
        add             v17.4s,  v17.4s,  v23.4s
        add             v29.4s,  v29.4s,  v27.4s      // 0, +1
        add             v4.4s,   v4.4s,   v28.4s
        add             v25.4s,  v25.4s,  v29.4s
        add             v26.4s,  v26.4s,  v4.4s
        ext             v27.16b, v22.16b, v23.16b, #4 // +stride
        ext             v28.16b, v23.16b, v24.16b, #4
        ext             v29.16b, v22.16b, v23.16b, #8 // +1+stride
        ext             v4.16b,  v23.16b, v24.16b, #8
.if \bpc == 8
        ld1             {v19.8b}, [x0]                // src
.else
        ld1             {v19.8h}, [x0]                // src
.endif
        add             v25.4s,  v25.4s,  v27.4s      // +stride
        add             v26.4s,  v26.4s,  v28.4s
        add             v16.4s,  v16.4s,  v29.4s      // +1+stride
        add             v17.4s,  v17.4s,  v4.4s
        shl             v25.4s,  v25.4s,  #2          // mid * 4
        shl             v26.4s,  v26.4s,  #2
        mla             v25.4s,  v16.4s,  v7.4s       // * 3 -> b
        mla             v26.4s,  v17.4s,  v7.4s
.if \bpc == 8
        uxtl            v19.8h,  v19.8b               // src
.endif
        mov             v0.16b,  v1.16b
        umlsl           v25.4s,  v2.4h,   v19.4h      // b - a * src
        umlsl2          v26.4s,  v2.8h,   v19.8h
        mov             v2.16b,  v3.16b
        rshrn           v25.4h,  v25.4s,  #9          // t1
        rshrn2          v25.8h,  v26.4s,  #9

        subs            w3,  w3,  #8                  // w -= 8

        // weighted1
        mov             v4.16b,  v5.16b

        ld1             {v1.8h}, [x9],  #16
        ld1             {v3.8h}, [x10], #16
        smull           v26.4s,  v25.4h,  v31.4h // v = t1 * w1
        smull2          v27.4s,  v25.8h,  v31.8h
        ld1             {v5.8h}, [x2],  #16
        rshrn           v26.4h,  v26.4s,  #11
        rshrn2          v26.8h,  v27.4s,  #11
        usqadd          v19.8h,  v26.8h               // src + v, saturating
.if \bpc == 8
        mov             v16.16b, v18.16b
        sqxtun          v26.8b,  v19.8h               // clamp to 8 bit pixel
        mov             v19.16b, v21.16b
        mov             v22.16b, v24.16b
        st1             {v26.8b}, [x0], #8
.else
        mov             v16.16b, v18.16b
        umin            v26.8h,  v19.8h,  v30.8h      // clamp to bitdepth_max
        mov             v19.16b, v21.16b
        mov             v22.16b, v24.16b
        st1             {v26.8h}, [x0], #16
.endif

        b.le            3f
        ld1             {v17.4s, v18.4s}, [x7], #32
        ld1             {v20.4s, v21.4s}, [x8], #32
        ld1             {v23.4s, v24.4s}, [x1], #32
        b               2b

3:
        ret
endfunc

// void dav1d_sgr_finish_filter2_2rows_Xbpc_neon(int16_t *tmp,
//                                               const pixel *src,
//                                               const ptrdiff_t stride,
//                                               const int32_t **a,
//                                               const int16_t **b,
//                                               const int w, const int h);
//
// Produces two rows of the SGR filter2 output from two input sum rows.
// Output row 0 (aligned with the sum rows) combines the diagonal taps
// with weight 5 and the vertical neighbours with weight 6, rounded by
// 9 bits; output row 1 (the in-between row) uses only the lower sum
// row with the same 5/6 weights, rounded by 8 bits. Row 1 goes to
// tmp + FILTER_OUT_STRIDE.
function sgr_finish_filter2_2rows_\bpc\()bpc_neon, export=1
        // v8-v15 have callee-saved low halves per AAPCS64; save d8-d15.
        stp             d8,  d9,  [sp, #-0x40]!
        stp             d10, d11, [sp, #0x10]
        stp             d12, d13, [sp, #0x20]
        stp             d14, d15, [sp, #0x30]

        // x3, x7 = a[0..1] (32 bit sums); x4, x8 = b[0..1] (16 bit sums).
        ldp             x3,  x7,  [x3]
        ldp             x4,  x8,  [x4]
        mov             x10, #FILTER_OUT_STRIDE
        cmp             w6,  #1
        add             x2,  x1,  x2 // src + stride
        csel            x2,  x1,  x2,  le // if (h <= 1) x2 = x1
        add             x10, x0,  x10, lsl #1 // x10 = &tmp[FILTER_OUT_STRIDE]
        movi            v4.8h,  #5   // diagonal weight, 16 bit sums
        movi            v5.4s,  #5   // diagonal weight, 32 bit sums
        movi            v6.8h,  #6   // vertical weight, 16 bit sums
        movi            v7.4s,  #6   // vertical weight, 32 bit sums
1:
        ld1             {v0.8h, v1.8h}, [x4], #32
        ld1             {v2.8h, v3.8h}, [x8], #32
        ld1             {v16.4s, v17.4s, v18.4s}, [x3], #48
        ld1             {v19.4s, v20.4s, v21.4s}, [x7], #48

2:
        // 16 bit sums; tap offsets are relative to the centre pixel.
        ext             v24.16b, v0.16b,  v1.16b, #4  // +1-stride
        ext             v25.16b, v2.16b,  v3.16b, #4  // +1+stride
        ext             v22.16b, v0.16b,  v1.16b, #2  // -stride
        ext             v23.16b, v2.16b,  v3.16b, #2  // +stride
        add             v0.8h,   v0.8h,   v24.8h      // -1-stride, +1-stride
        add             v25.8h,  v2.8h,   v25.8h      // -1+stride, +1+stride
        add             v2.8h,   v22.8h,  v23.8h      // -stride, +stride
        add             v0.8h,   v0.8h,   v25.8h

        // a for output row 1: only the lower sum row contributes.
        mul             v8.8h,   v25.8h,  v4.8h       // * 5
        mla             v8.8h,   v23.8h,  v6.8h       // * 6

        ext             v22.16b, v16.16b, v17.16b, #4 // -stride
        ext             v23.16b, v17.16b, v18.16b, #4
        ext             v24.16b, v19.16b, v20.16b, #4 // +stride
        ext             v25.16b, v20.16b, v21.16b, #4
        ext             v26.16b, v16.16b, v17.16b, #8 // +1-stride
        ext             v27.16b, v17.16b, v18.16b, #8
        ext             v28.16b, v19.16b, v20.16b, #8 // +1+stride
        ext             v29.16b, v20.16b, v21.16b, #8
        // a for output row 0: 5 * diagonals + 6 * verticals.
        mul             v0.8h,   v0.8h,   v4.8h       // * 5
        mla             v0.8h,   v2.8h,   v6.8h       // * 6
.if \bpc == 8
        ld1             {v31.8b}, [x1], #8            // src row 0
        ld1             {v30.8b}, [x2], #8            // src row 1
.else
        ld1             {v31.8h}, [x1], #16           // src row 0
        ld1             {v30.8h}, [x2], #16           // src row 1
.endif
        // Same combination on the 32 bit sums -> b.
        add             v16.4s,  v16.4s,  v26.4s      // -1-stride, +1-stride
        add             v17.4s,  v17.4s,  v27.4s
        add             v19.4s,  v19.4s,  v28.4s      // -1+stride, +1+stride
        add             v20.4s,  v20.4s,  v29.4s
        add             v16.4s,  v16.4s,  v19.4s
        add             v17.4s,  v17.4s,  v20.4s

        // b for output row 1 (lower sum row only).
        mul             v9.4s,   v19.4s,  v5.4s       // * 5
        mla             v9.4s,   v24.4s,  v7.4s       // * 6
        mul             v10.4s,  v20.4s,  v5.4s       // * 5
        mla             v10.4s,  v25.4s,  v7.4s       // * 6

        add             v22.4s,  v22.4s,  v24.4s      // -stride, +stride
        add             v23.4s,  v23.4s,  v25.4s
        // This is, surprisingly, faster than other variants where the
        // mul+mla pairs are further apart, on Cortex A53.
        mul             v16.4s,  v16.4s,  v5.4s       // * 5
        mla             v16.4s,  v22.4s,  v7.4s       // * 6
        mul             v17.4s,  v17.4s,  v5.4s       // * 5
        mla             v17.4s,  v23.4s,  v7.4s       // * 6

.if \bpc == 8
        uxtl            v31.8h,  v31.8b
        uxtl            v30.8h,  v30.8b
.endif
        umlsl           v16.4s,  v0.4h,   v31.4h      // b - a * src
        umlsl2          v17.4s,  v0.8h,   v31.8h
        umlsl           v9.4s,   v8.4h,   v30.4h      // b - a * src
        umlsl2          v10.4s,  v8.8h,   v30.8h
        mov             v0.16b,  v1.16b
        // Row 0 rounds by 9 (weight sum 32), row 1 by 8 (weight sum 16).
        rshrn           v16.4h,  v16.4s,  #9
        rshrn2          v16.8h,  v17.4s,  #9
        rshrn           v9.4h,   v9.4s,   #8
        rshrn2          v9.8h,   v10.4s,  #8
        subs            w5,  w5,  #8                  // w -= 8
        mov             v2.16b,  v3.16b
        st1             {v16.8h}, [x0],  #16          // output row 0
        st1             {v9.8h},  [x10], #16          // output row 1

        b.le            9f
        // Shift the leftover vectors down and refill their upper parts.
        mov             v16.16b, v18.16b
        mov             v19.16b, v21.16b
        ld1             {v1.8h}, [x4], #16
        ld1             {v3.8h}, [x8], #16
        ld1             {v17.4s, v18.4s}, [x3], #32
        ld1             {v20.4s, v21.4s}, [x7], #32
        b               2b

9:
        // Restore callee-saved SIMD registers and return.
        ldp             d14, d15, [sp, #0x30]
        ldp             d12, d13, [sp, #0x20]
        ldp             d10, d11, [sp, #0x10]
        ldp             d8,  d9,  [sp], 0x40
        ret
endfunc

// void dav1d_sgr_finish_weighted2_Xbpc_neon(pixel *dst, const ptrdiff_t stride,
//                                           const int32_t **a,
//                                           const int16_t **b,
//                                           const int w, const int h,
//                                           const int w1,
//                                           const int bitdepth_max);
//
// Fused filter2 + weighting for two destination rows: computes the two
// filter2 rows exactly as sgr_finish_filter2_2rows above (5/6 weights,
// rounded by 9 and 8 bits respectively), then applies the weight to
// both: dst = clip(src + ((t1 * w1) >> 11)), in place on dst and
// dst + stride.
function sgr_finish_weighted2_\bpc\()bpc_neon, export=1
        // Only v8-v10 and v14-v15 of the callee-saved range are used;
        // save exactly those (d8-d10, d14-d15).
        stp             d8,  d9,  [sp, #-0x30]!
        str             d10,      [sp, #0x10]
        stp             d14, d15, [sp, #0x20]

        dup             v14.8h, w6   // w1 (weight)
        dup             v15.8h, w7   // bitdepth_max (16 bpc clamp)

        // x2, x7 = a[0..1] (32 bit sums); x3, x8 = b[0..1] (16 bit sums).
        ldp             x2,  x7,  [x2]
        ldp             x3,  x8,  [x3]
        cmp             w5,  #1
        add             x1,  x0,  x1 // src + stride
        // if (h <= 1), set the pointer to the second row to any dummy buffer
        // we can clobber (x2 in this case)
        csel            x1,  x2,  x1,  le
        movi            v4.8h,  #5   // diagonal weight, 16 bit sums
        movi            v5.4s,  #5   // diagonal weight, 32 bit sums
        movi            v6.8h,  #6   // vertical weight, 16 bit sums
        movi            v7.4s,  #6   // vertical weight, 32 bit sums
1:
        ld1             {v0.8h, v1.8h}, [x3], #32
        ld1             {v2.8h, v3.8h}, [x8], #32
        ld1             {v16.4s, v17.4s, v18.4s}, [x2], #48
        ld1             {v19.4s, v20.4s, v21.4s}, [x7], #48

2:
        // 16 bit sums; tap offsets are relative to the centre pixel.
        ext             v24.16b, v0.16b,  v1.16b, #4  // +1-stride
        ext             v25.16b, v2.16b,  v3.16b, #4  // +1+stride
        ext             v22.16b, v0.16b,  v1.16b, #2  // -stride
        ext             v23.16b, v2.16b,  v3.16b, #2  // +stride
        add             v0.8h,   v0.8h,   v24.8h      // -1-stride, +1-stride
        add             v25.8h,  v2.8h,   v25.8h      // -1+stride, +1+stride
        add             v2.8h,   v22.8h,  v23.8h      // -stride, +stride
        add             v0.8h,   v0.8h,   v25.8h

        // a for output row 1: only the lower sum row contributes.
        mul             v8.8h,   v25.8h,  v4.8h       // * 5
        mla             v8.8h,   v23.8h,  v6.8h       // * 6

        ext             v22.16b, v16.16b, v17.16b, #4 // -stride
        ext             v23.16b, v17.16b, v18.16b, #4
        ext             v24.16b, v19.16b, v20.16b, #4 // +stride
        ext             v25.16b, v20.16b, v21.16b, #4
        ext             v26.16b, v16.16b, v17.16b, #8 // +1-stride
        ext             v27.16b, v17.16b, v18.16b, #8
        ext             v28.16b, v19.16b, v20.16b, #8 // +1+stride
        ext             v29.16b, v20.16b, v21.16b, #8
        // a for output row 0: 5 * diagonals + 6 * verticals.
        mul             v0.8h,   v0.8h,   v4.8h       // * 5
        mla             v0.8h,   v2.8h,   v6.8h       // * 6
.if \bpc == 8
        ld1             {v31.8b}, [x0]                // src row 0
        ld1             {v30.8b}, [x1]                // src row 1
.else
        ld1             {v31.8h}, [x0]                // src row 0
        ld1             {v30.8h}, [x1]                // src row 1
.endif
        // Same combination on the 32 bit sums -> b.
        add             v16.4s,  v16.4s,  v26.4s      // -1-stride, +1-stride
        add             v17.4s,  v17.4s,  v27.4s
        add             v19.4s,  v19.4s,  v28.4s      // -1+stride, +1+stride
        add             v20.4s,  v20.4s,  v29.4s
        add             v16.4s,  v16.4s,  v19.4s
        add             v17.4s,  v17.4s,  v20.4s

        // b for output row 1 (lower sum row only).
        mul             v9.4s,   v19.4s,  v5.4s       // * 5
        mla             v9.4s,   v24.4s,  v7.4s       // * 6
        mul             v10.4s,  v20.4s,  v5.4s       // * 5
        mla             v10.4s,  v25.4s,  v7.4s       // * 6

        add             v22.4s,  v22.4s,  v24.4s      // -stride, +stride
        add             v23.4s,  v23.4s,  v25.4s
        // This is, surprisingly, faster than other variants where the
        // mul+mla pairs are further apart, on Cortex A53.
        mul             v16.4s,  v16.4s,  v5.4s       // * 5
        mla             v16.4s,  v22.4s,  v7.4s       // * 6
        mul             v17.4s,  v17.4s,  v5.4s       // * 5
        mla             v17.4s,  v23.4s,  v7.4s       // * 6

.if \bpc == 8
        uxtl            v31.8h,  v31.8b
        uxtl            v30.8h,  v30.8b
.endif
        umlsl           v16.4s,  v0.4h,   v31.4h      // b - a * src
        umlsl2          v17.4s,  v0.8h,   v31.8h
        umlsl           v9.4s,   v8.4h,   v30.4h      // b - a * src
        umlsl2          v10.4s,  v8.8h,   v30.8h
        mov             v0.16b,  v1.16b
        // Row 0 rounds by 9 (weight sum 32), row 1 by 8 (weight sum 16).
        rshrn           v16.4h,  v16.4s,  #9
        rshrn2          v16.8h,  v17.4s,  #9
        rshrn           v9.4h,   v9.4s,   #8
        rshrn2          v9.8h,   v10.4s,  #8

        subs            w4,  w4,  #8                  // w -= 8

        // weighted1
        mov             v2.16b,  v3.16b

        ld1             {v1.8h}, [x3], #16
        ld1             {v3.8h}, [x8], #16
        smull           v22.4s,  v16.4h,  v14.4h // v = t1 * w1
        smull2          v23.4s,  v16.8h,  v14.8h
        mov             v16.16b, v18.16b
        smull           v24.4s,  v9.4h,   v14.4h
        smull2          v25.4s,  v9.8h,   v14.8h
        mov             v19.16b, v21.16b
        rshrn           v22.4h,  v22.4s,  #11
        rshrn2          v22.8h,  v23.4s,  #11
        rshrn           v23.4h,  v24.4s,  #11
        rshrn2          v23.8h,  v25.4s,  #11
        usqadd          v31.8h,  v22.8h               // src + v, saturating
        usqadd          v30.8h,  v23.8h
.if \bpc == 8
        sqxtun          v22.8b,  v31.8h               // clamp to 8 bit pixels
        sqxtun          v23.8b,  v30.8h
        st1             {v22.8b}, [x0], #8
        st1             {v23.8b}, [x1], #8
.else
        umin            v22.8h,  v31.8h,  v15.8h      // clamp to bitdepth_max
        umin            v23.8h,  v30.8h,  v15.8h
        st1             {v22.8h}, [x0], #16
        st1             {v23.8h}, [x1], #16
.endif

        b.le            3f
        ld1             {v17.4s, v18.4s}, [x2], #32
        ld1             {v20.4s, v21.4s}, [x7], #32
        b               2b

3:
        // Restore the saved callee-saved SIMD registers and return.
        ldp             d14, d15, [sp, #0x20]
        ldr             d10,      [sp, #0x10]
        ldp             d8,  d9,  [sp], 0x30
        ret
endfunc

// void dav1d_sgr_weighted2_Xbpc_neon(pixel *dst, const ptrdiff_t stride,
//                                    const int16_t *t1, const int16_t *t2,
//                                    const int w, const int h,
//                                    const int16_t wt[2], const int bitdepth_max);
//
// Final SGR blend of the two intermediate filter outputs t1 and t2:
// dst = clip(src + ((t1 * wt[0] + t2 * wt[1]) >> 11)), applied in place
// on dst. Processes two rows per iteration of loop 1; an odd trailing
// row falls through to loop 2, which handles a single row.
function sgr_weighted2_\bpc\()bpc_neon, export=1
        cmp             w5,  #2
        add             x10, x0,  x1                  // x10 = second dst row
        add             x12, x2,  #2*FILTER_OUT_STRIDE // second t1 row
        add             x13, x3,  #2*FILTER_OUT_STRIDE // second t2 row
        ld2r            {v30.8h, v31.8h}, [x6] // wt[0], wt[1]
.if \bpc == 16
        dup             v29.8h,  w7                   // bitdepth_max clamp
.endif
        mov             x8,  #4*FILTER_OUT_STRIDE     // t1/t2 advance: 2 rows
        lsl             x1,  x1,  #1                  // dst advance: 2 rows
        add             w9,  w4,  #7
        bic             x9,  x9,  #7 // Aligned width
.if \bpc == 8
        sub             x1,  x1,  x9                  // minus bytes consumed per row
.else
        sub             x1,  x1,  x9, lsl #1
.endif
        sub             x8,  x8,  x9, lsl #1
        mov             w9,  w4                       // keep w for later rows
        b.lt            2f                            // h < 2: single-row loop
1:
        // Two-row loop, 8 pixels per iteration.
.if \bpc == 8
        ld1             {v0.8b},  [x0]
        ld1             {v16.8b}, [x10]
.else
        ld1             {v0.8h},  [x0]
        ld1             {v16.8h}, [x10]
.endif
        ld1             {v1.8h},  [x2],  #16
        ld1             {v17.8h}, [x12], #16
        ld1             {v2.8h},  [x3],  #16
        ld1             {v18.8h}, [x13], #16
        subs            w4,  w4,  #8
.if \bpc == 8
        uxtl            v0.8h,  v0.8b
        uxtl            v16.8h, v16.8b
.endif
        smull           v3.4s,  v1.4h,  v30.4h // wt[0] * t1
        smlal           v3.4s,  v2.4h,  v31.4h // wt[1] * t2
        smull2          v4.4s,  v1.8h,  v30.8h // wt[0] * t1
        smlal2          v4.4s,  v2.8h,  v31.8h // wt[1] * t2
        smull           v19.4s, v17.4h, v30.4h // wt[0] * t1
        smlal           v19.4s, v18.4h, v31.4h // wt[1] * t2
        smull2          v20.4s, v17.8h, v30.8h // wt[0] * t1
        smlal2          v20.4s, v18.8h, v31.8h // wt[1] * t2
        rshrn           v3.4h,  v3.4s,  #11
        rshrn2          v3.8h,  v4.4s,  #11
        rshrn           v19.4h, v19.4s, #11
        rshrn2          v19.8h, v20.4s, #11
        usqadd          v0.8h,  v3.8h                 // src + v, saturating
        usqadd          v16.8h, v19.8h
.if \bpc == 8
        sqxtun          v3.8b,  v0.8h                 // clamp to 8 bit pixels
        sqxtun          v19.8b, v16.8h
        st1             {v3.8b},  [x0],  #8
        st1             {v19.8b}, [x10], #8
.else
        umin            v3.8h,  v0.8h,  v29.8h        // clamp to bitdepth_max
        umin            v19.8h, v16.8h, v29.8h
        st1             {v3.8h},  [x0],  #16
        st1             {v19.8h}, [x10], #16
.endif
        b.gt            1b

        // Advance all pointers to the next pair of rows.
        subs            w5,  w5,  #2
        cmp             w5,  #1
        b.lt            0f                            // no rows left
        mov             w4,  w9                       // reset width counter
        add             x0,  x0,  x1
        add             x10, x10, x1
        add             x2,  x2,  x8
        add             x12, x12, x8
        add             x3,  x3,  x8
        add             x13, x13, x8
        b.eq            2f                            // one row left
        b               1b

2:
        // Single-row tail loop (h odd, or h == 1 from the start).
.if \bpc == 8
        ld1             {v0.8b}, [x0]
.else
        ld1             {v0.8h}, [x0]
.endif
        ld1             {v1.8h}, [x2], #16
        ld1             {v2.8h}, [x3], #16
        subs            w4,  w4,  #8
.if \bpc == 8
        uxtl            v0.8h,  v0.8b
.endif
        smull           v3.4s,  v1.4h,  v30.4h // wt[0] * t1
        smlal           v3.4s,  v2.4h,  v31.4h // wt[1] * t2
        smull2          v4.4s,  v1.8h,  v30.8h // wt[0] * t1
        smlal2          v4.4s,  v2.8h,  v31.8h // wt[1] * t2
        rshrn           v3.4h,  v3.4s,  #11
        rshrn2          v3.8h,  v4.4s,  #11
        usqadd          v0.8h,  v3.8h                 // src + v, saturating
.if \bpc == 8
        sqxtun          v3.8b,  v0.8h
        st1             {v3.8b}, [x0], #8
.else
        umin            v3.8h,  v0.8h,  v29.8h
        st1             {v3.8h}, [x0], #16
.endif
        b.gt            2b
0:
        ret
endfunc
.endm

Messung V0.5
C=100 H=100 G=100

¤ Dauer der Verarbeitung: 0.15 Sekunden  (vorverarbeitet)  ¤

*© Formatika GbR, Deutschland






Wurzel

Suchen

Beweissystem der NASA

Beweissystem Isabelle

NIST Cobol Testsuite

Cephes Mathematical Library

Wiener Entwicklungsmethode

Haftungshinweis

Die Informationen auf dieser Webseite wurden nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit noch Richtigkeit noch Qualität der bereitgestellten Informationen zugesichert.

Bemerkung:

Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.






                                                                                                                                                                                                                                                                                                                                                                                                     


Neuigkeiten

     Aktuelles
     Motto des Tages

Software

     Produkte
     Quellcodebibliothek

Aktivitäten

     Artikel über Sicherheit
     Anleitung zur Aktivierung von SSL

Muße

     Gedichte
     Musik
     Bilder

Jenseits des Üblichen ....

Besucherstatistik

Besucherstatistik

Monitoring

Montastic status badge