Quelle msac.S

Sprache: Sparc

/*
* Copyright © 2019, VideoLAN and dav1d authors
* Copyright © 2019, Martin Storsjo
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
*    list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
*    this list of conditions and the following disclaimer in the documentation
*    and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#include "src/arm/asm.S"
#include "util.S"

#define BUF_POS 0
#define BUF_END 8
#define DIF 16
#define RNG 24
#define CNT 28
#define ALLOW_UPDATE_CDF 32

#define COEFFS_BASE_OFFSET 30
#define MASKS8_OFFSET (64-COEFFS_BASE_OFFSET)

const coeffs
        .short 60, 56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, 12, 8, 4, 0
        .short 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 0, 0
        // masks8
        .short -0x202, -0x202, -0x202, -0x202, -0x202, -0x202, -0x202, 0xF0E
endconst

.macro ld1_n d0, d1, src, sz, n
.if \n <= 8
        ld1             {\d0\sz},  [\src]
.else
        ld1             {\d0\sz, \d1\sz},  [\src]
.endif
.endm

.macro st1_n s0, s1, dst, sz, n
.if \n <= 8
        st1             {\s0\sz},  [\dst]
.else
        st1             {\s0\sz, \s1\sz},  [\dst]
.endif
.endm

.macro ushr_n d0, d1, s0, s1, shift, sz, n
        ushr            \d0\sz,  \s0\sz,  \shift
.if \n == 16
        ushr            \d1\sz,  \s1\sz,  \shift
.endif
.endm

.macro add_n d0, d1, s0, s1, s2, s3, sz, n
        add             \d0\sz,  \s0\sz,  \s2\sz
.if \n == 16
        add             \d1\sz,  \s1\sz,  \s3\sz
.endif
.endm

.macro sub_n d0, d1, s0, s1, s2, s3, sz, n
        sub             \d0\sz,  \s0\sz,  \s2\sz
.if \n == 16
        sub             \d1\sz,  \s1\sz,  \s3\sz
.endif
.endm

.macro and_n d0, d1, s0, s1, s2, s3, sz, n
        and             \d0\sz,  \s0\sz,  \s2\sz
.if \n == 16
        and             \d1\sz,  \s1\sz,  \s3\sz
.endif
.endm

.macro cmhs_n d0, d1, s0, s1, s2, s3, sz, n
        cmhs            \d0\sz,  \s0\sz,  \s2\sz
.if \n == 16
        cmhs            \d1\sz,  \s1\sz,  \s3\sz
.endif
.endm

.macro sshl_n d0, d1, s0, s1, s2, s3, sz, n
        sshl            \d0\sz,  \s0\sz,  \s2\sz
.if \n == 16
        sshl            \d1\sz,  \s1\sz,  \s3\sz
.endif
.endm

.macro sqdmulh_n d0, d1, s0, s1, s2, s3, sz, n
        sqdmulh         \d0\sz,  \s0\sz,  \s2\sz
.if \n == 16
        sqdmulh         \d1\sz,  \s1\sz,  \s3\sz
.endif
.endm

.macro str_n            idx0, idx1, dstreg, dstoff, n
        str             \idx0,  [\dstreg, \dstoff]
.if \n == 16
        str             \idx1,  [\dstreg, \dstoff + 16]
.endif
.endm

// unsigned dav1d_msac_decode_symbol_adapt4_neon(MsacContext *s, uint16_t *cdf,
//                                               size_t n_symbols);

function msac_decode_symbol_adapt4_neon, export=1
.macro decode_update sz, szb, n
.if \n == 16
        sub             sp,  sp,  #48
.endif
        add             x8,  x0,  #RNG
        ld1_n           v0,  v1,  x1,  \sz, \n                    // cdf
        ld1r            {v29\sz}, [x8]                            // rng
        movrel          x9,  coeffs, COEFFS_BASE_OFFSET
        movi            v31\sz, #0x7f, lsl #8                     // 0x7f00
        sub             x10, x9,  x2, lsl #1
        mvni            v30\sz, #0x3f                             // 0xffc0
        and             v7\szb, v29\szb, v31\szb                  // rng & 0x7f00
.if \n == 16
        str             h29, [sp, #14]                            // store original u = s->rng
.endif
        and_n           v2,  v3,  v0,  v1,  v30, v30, \szb, \n    // cdf & 0xffc0

        ld1_n           v4,  v5,  x10, \sz, \n                    // EC_MIN_PROB * (n_symbols - ret)
        sqdmulh_n       v6,  v7,  v2,  v3,  v7,  v7,  \sz, \n     // ((cdf >> EC_PROB_SHIFT) * (r - 128)) >> 1
        ldr             d28, [x0, #DIF]

        add_n           v4,  v5,  v2,  v3,  v4,  v5,  \sz, \n     // v = cdf + EC_MIN_PROB * (n_symbols - ret)
        add_n           v4,  v5,  v6,  v7,  v4,  v5,  \sz, \n     // v = ((cdf >> EC_PROB_SHIFT) * r) >> 1 + EC_MIN_PROB * (n_symbols - ret)

        dup             v30\sz, v28.h[3]                          // dif >> (EC_WIN_SIZE - 16)
.if \n == 8
        ldur            q31, [x9, #MASKS8_OFFSET]
.elseif \n == 16
        str_n           q4,  q5,  sp, #16, \n                     // store v values to allow indexed access
.endif

        // After the condition starts being true it continues, such that the vector looks like:
        //   0, 0, 0 ... -1, -1
        cmhs_n          v2,  v3,  v30, v30, v4,  v5,  \sz,  \n    // c >= v
.if \n == 4
        ext             v29\szb, v29\szb, v4\szb, #6              // u
        umov            x15, v2.d[0]
        ldr             w4,  [x0, #ALLOW_UPDATE_CDF]
        rev             x15, x15
        sub             v29\sz, v29\sz, v4\sz                     // rng = u-v
        // rev + clz = count trailing zeros
        clz             x15, x15                                  // 16*ret
.elseif \n == 8
        // The final short of the compare is always set.
        // Using addv, subtract -0x202*ret from this value to create a lookup table for a short.
        //  For n == 8:
        // -0x202 + -0x202 + ... + 0xF0E
        //                    (0x202*7) | (1 << 8)
        //                                    ^-------offset for second byte of the short
        and             v31\szb, v31\szb, v2\szb
        ext             v29\szb, v29\szb, v4\szb, #14             // u
        addv            h31, v31\sz                               // ((2*ret + 1) << 8) | (2*ret)
        ldr             w4,  [x0, #ALLOW_UPDATE_CDF]
        sub             v30\sz, v30\sz, v4\sz                     // (dif >> 48) - v
        smov            w15, v31.b[0]                             // 2*ret
        sub             v29\sz, v29\sz, v4\sz                     // rng = u-v
.elseif \n == 16
        add             v6\sz,  v2\sz,  v3\sz
        addv            h31, v6\sz                                // -n + ret
        ldr             w4,  [x0, #ALLOW_UPDATE_CDF]
        smov            w15, v31.h[0]
.endif

        cbz             w4,  0f

        // update_cdf
        ldrh            w3,  [x1, x2, lsl #1]                     // count = cdf[n_symbols]
.if \n == 16
        // 16 case has a lower bound that guarantees n_symbols > 2
        mov             w4,  #-5
.elseif \n == 8
        mvn             w14, w2
        mov             w4,  #-4
        cmn             w14, #3                                   // set C if n_symbols <= 2
.else
        // if n_symbols < 4 (or < 6 even) then
        //   (1 + n_symbols) >> 2 == n_symbols > 2
        add             w14, w2,  #17                             // (1 + n_symbols) + (4 << 2)
.endif
        sub_n           v16, v17, v0,  v1,  v2,  v3,  \sz, \n     // cdf + (i >= val ? 1 : 0)
        orr             v2\sz, #0x80, lsl #8
.if \n == 16
        orr             v3\sz, #0x80, lsl #8
.endif
.if \n == 16
        sub             w4,  w4,  w3, lsr #4                      // -((count >> 4) + 5)
.elseif \n == 8
        lsr             w14, w3,  #4                              // count >> 4
        sbc             w4,  w4,  w14                             // -((count >> 4) + (n_symbols > 2) + 4)
.else
        neg             w4, w14, lsr #2                           // -((n_symbols > 2) + 4)
        sub             w4,  w4,  w3,  lsr #4                     // -((count >> 4) + (n_symbols > 2) + 4)
.endif
        sub_n           v2,  v3,  v2,  v3,  v0,  v1,  \sz, \n     // (32768 - cdf[i]) or (-1 - cdf[i])
        dup             v6\sz,    w4                              // -rate

        sub             w3,  w3,  w3, lsr #5                      // count - (count == 32)
        sshl_n          v2,  v3,  v2,  v3,  v6,  v6,  \sz, \n     // ({32768,-1} - cdf[i]) >> rate
        add             w3,  w3,  #1                              // count + (count < 32)
        add_n           v0,  v1,  v16, v17, v2,  v3,  \sz, \n     // cdf + (32768 - cdf[i]) >> rate
        st1_n           v0,  v1,  x1,  \sz, \n
        strh            w3,  [x1, x2, lsl #1]

0:
        // renorm
.if \n == 4
        ldr             w6,  [x0, #CNT]
        ldr             x7,  [x0, #DIF]
        mov             x4,  v29.d[0]          // rng (packed)
        mov             x3,  v4.d[0]           // v (packed)

        // Shift 'v'/'rng' for ret into the 16 least sig bits. There is
        //  garbage in the remaining bits, but we can work around this.
        lsr             x4,  x4,  x15          // rng
        lsr             x3,  x3,  x15          // v
        lsl             w5,  w4,  #16          // rng << 16
        sub             x7,  x7,  x3, lsl #48  // dif - (v << 48)
        clz             w5,  w5                // d = clz(rng << 16)
        lsl             w4,  w4,  w5           // rng << d
        subs            w6,  w6,  w5           // cnt -= d
        lsl             x7,  x7,  x5           // (dif - (v << 48)) << d
        strh            w4,  [x0, #RNG]
        b.lo            1f
        str             w6,  [x0, #CNT]
        str             x7,  [x0, #DIF]
        lsr             w0,  w15, #4
        ret
1:
        lsr             w15, w15, #4
        b L(refill)
.elseif \n == 8
        ldr             w6,  [x0, #CNT]
        tbl             v30.8b, {v30.16b}, v31.8b
        tbl             v29.8b, {v29.16b}, v31.8b
        ins             v28.h[3], v30.h[0]     // dif - (v << 48)
        clz             v0.4h,  v29.4h         // d = clz(rng)
        umov            w5,  v0.h[0]
        ushl            v29.4h, v29.4h, v0.4h  // rng << d

        // The vec for clz(rng) is filled with garbage after the first short,
        //  but ushl/sshl conveniently uses only the first byte for the shift
        //  amount.
        ushl            d28, d28, d0           // (dif - (v << 48)) << d

        subs            w6,  w6,  w5           // cnt -= d
        str             h29, [x0, #RNG]
        b.lo            1f
        str             w6,  [x0, #CNT]
        str             d28, [x0, #DIF]
        lsr             w0,  w15, #1           // ret
        ret
1:
        lsr             w15, w15, #1           // ret
        mov             x7, v28.d[0]
        b L(refill)
.elseif \n == 16
        add             x8,  sp,  w15, sxtw #1
        ldrh            w3,  [x8, #48]         // v
        ldurh           w4,  [x8, #46]         // u
        ldr             w6,  [x0, #CNT]
        ldr             x7,  [x0, #DIF]
        sub             w4,  w4,  w3           // rng = u - v
        clz             w5,  w4                // clz(rng)
        eor             w5,  w5,  #16          // d = clz(rng) ^ 16
        sub             x7,  x7,  x3, lsl #48  // dif - (v << 48)
        lsl             w4,  w4,  w5           // rng << d
        subs            w6,  w6,  w5           // cnt -= d
        lsl             x7,  x7,  x5           // (dif - (v << 48)) << d
        str             w4,  [x0, #RNG]
        add             sp,  sp,  #48
        b.lo            1f
        str             w6,  [x0, #CNT]
        str             x7,  [x0, #DIF]
        add             w0,  w15, #\n          // ret
        ret
1:
        add             w15, w15, #\n          // ret
        b L(refill)
.endif
.endm

        decode_update   .4h, .8b, 4

L(refill):
        // refill
        ldp             x3,  x4,  [x0]         // BUF_POS, BUF_END
        add             x5,  x3,  #8
        subs            x5,  x5,  x4
        b.hi            6f

        ldr             x8,  [x3]              // next_bits
        add             w4,  w6,  #-48         // shift_bits = cnt + 16 (- 64)
        mvn             x8,  x8
        neg             w5,  w4
        rev             x8,  x8                // next_bits = bswap(next_bits)
        lsr             w5,  w5,  #3           // num_bytes_read
        lsr             x8,  x8,  x4           // next_bits >>= (shift_bits & 63)

2:      // refill_end
        add             x3,  x3,  x5
        add             w6,  w6,  w5, lsl #3   // cnt += num_bits_read
        str             x3,  [x0, #BUF_POS]

3:      // refill_end2
        orr             x7,  x7,  x8           // dif |= next_bits

4:      // end
        str             w6,  [x0, #CNT]
        str             x7,  [x0, #DIF]

        mov             w0,  w15
        ret

5:      // pad_with_ones
        add             w8,  w6,  #-16
        ror             x8,  x8,  x8
        b               3b

6:      // refill_eob
        cmp             x3,  x4
        b.hs            5b

        ldr             x8,  [x4, #-8]
        lsl             w5,  w5,  #3
        lsr             x8,  x8,  x5
        add             w5,  w6,  #-48
        mvn             x8,  x8
        sub             w4,  w4,  w3           // num_bytes_left
        rev             x8,  x8
        lsr             x8,  x8,  x5
        neg             w5,  w5
        lsr             w5,  w5,  #3
        cmp             w5,  w4
        csel            w5,  w5,  w4,  lo      // num_bytes_read
        b               2b
endfunc

function msac_decode_symbol_adapt8_neon, export=1
        decode_update   .8h, .16b, 8
endfunc

function msac_decode_symbol_adapt16_neon, export=1
        decode_update   .8h, .16b, 16
endfunc

function msac_decode_hi_tok_neon, export=1
        ld1             {v0.4h},  [x1]            // cdf
        add             x16, x0,  #RNG
        movi            v31.4h, #0x7f, lsl #8     // 0x7f00
        movrel          x17, coeffs, COEFFS_BASE_OFFSET-2*3
        mvni            v30.4h, #0x3f             // 0xffc0
        ldrh            w9,  [x1, #6]             // count = cdf[n_symbols]
        ld1r            {v3.4h},  [x16]           // rng
        ld1             {v29.4h}, [x17]           // EC_MIN_PROB * (n_symbols - ret)
        add             x17, x0,  #DIF + 6
        mov             w13, #-24*8
        and             v17.8b,  v0.8b,   v30.8b  // cdf & 0xffc0
        ldr             w10, [x0, #ALLOW_UPDATE_CDF]
        ld1r            {v1.8h},  [x17]           // dif >> (EC_WIN_SIZE - 16)
        ldr             w6,  [x0, #CNT]
        ldr             x7,  [x0, #DIF]
1:
        and             v7.8b,   v3.8b,   v31.8b  // rng & 0x7f00
        sqdmulh         v6.4h,   v17.4h,  v7.4h   // ((cdf >> EC_PROB_SHIFT) * (r - 128)) >> 1
        add             v4.4h,   v17.4h,  v29.4h  // v = cdf + EC_MIN_PROB * (n_symbols - ret)
        add             v4.4h,   v6.4h,   v4.4h   // v = ((cdf >> EC_PROB_SHIFT) * r) >> 1 + EC_MIN_PROB * (n_symbols - ret)
        cmhs            v2.4h,   v1.4h,   v4.4h   // c >= v
        add             w13, w13, #5*8
        ext             v18.8b, v3.8b,  v4.8b, #6 // u
        umov            x15, v2.d[0]
        rev             x15, x15
        sub             v18.4h, v18.4h, v4.4h     // rng = u-v
        // rev + clz = count trailing zeros
        clz             x15, x15                  // 16*ret

        cbz             w10, 2f
        // update_cdf
        sub             v5.4h,   v0.4h,   v2.4h   // cdf[i] + (i >= val ? 1 : 0)
        mov             w4,  #-5
        orr             v2.4h, #0x80, lsl #8      // i >= val ? -1 : 32768
        sub             w4,  w4,  w9, lsr #4      // -((count >> 4) + 5)
        sub             v2.4h,   v2.4h,   v0.4h   // (32768 - cdf[i]) or (-1 - cdf[i])
        dup             v6.4h,    w4              // -rate

        sub             w9,  w9,  w9, lsr #5      // count - (count == 32)
        sshl            v2.4h,   v2.4h,   v6.4h   // ({32768,-1} - cdf[i]) >> rate
        add             w9,  w9,  #1              // count + (count < 32)
        add             v0.4h,   v5.4h,   v2.4h   // cdf[i] + (32768 - cdf[i]) >> rate
        st1             {v0.4h},  [x1]
        and             v17.8b,  v0.8b,   v30.8b  // cdf & 0xffc0
        strh            w9,  [x1, #6]

2:
        mov             x4,  v18.d[0]          // rng (packed)
        mov             x3,  v4.d[0]           // v (packed)

        // Shift 'v'/'rng' for ret into the 16 least sig bits. There is
        //  garbage in the remaining bits, but we can work around this.
        lsr             x4,  x4,  x15          // rng
        lsr             x3,  x3,  x15          // v
        lsl             w5,  w4,  #16          // rng << 16
        sub             x7,  x7,  x3, lsl #48  // dif - (v << 48)
        clz             w5,  w5                // d = clz(rng << 16)
        lsl             w4,  w4,  w5           // rng << d
        subs            w6,  w6,  w5           // cnt -= d
        lsl             x7,  x7,  x5           // (dif - (v << 48)) << d
        strh            w4,  [x0, #RNG]
        dup             v3.4h,   w4
        b.hs            5f

        // refill
        ldp             x3,  x4,  [x0]         // BUF_POS, BUF_END
        add             x5,  x3,  #8
        subs            x5,  x5,  x4
        b.hi            7f

        ldr             x8,  [x3]              // next_bits
        add             w4,  w6,  #-48         // shift_bits = cnt + 16 (- 64)
        mvn             x8,  x8
        neg             w5,  w4
        rev             x8,  x8                // next_bits = bswap(next_bits)
        lsr             w5,  w5,  #3           // num_bytes_read
        lsr             x8,  x8,  x4           // next_bits >>= (shift_bits & 63)

3:      // refill_end
        add             x3,  x3,  x5
        add             w6,  w6,  w5, lsl #3   // cnt += num_bits_read
        str             x3,  [x0, #BUF_POS]

4:      // refill_end2
        orr             x7,  x7,  x8           // dif |= next_bits

5:      // end
        sub             w15, w15, #5*8
        lsr             x12, x7,  #48
        adds            w13, w13, w15          // carry = tok_br < 3 || tok == 15
        dup             v1.8h,   w12
        b.cc            1b                     // loop if !carry
        add             w13, w13, #30*8
        str             w6,  [x0, #CNT]
        str             x7,  [x0, #DIF]
        lsr             w0,  w13, #4
        ret

6:      // pad_with_ones
        add             w8,  w6,  #-16
        ror             x8,  x8,  x8
        b               4b

7:      // refill_eob
        cmp             x3,  x4
        b.hs            6b

        ldr             x8,  [x4, #-8]
        lsl             w5,  w5,  #3
        lsr             x8,  x8,  x5
        add             w5,  w6,  #-48
        mvn             x8,  x8
        sub             w4,  w4,  w3           // num_bytes_left
        rev             x8,  x8
        lsr             x8,  x8,  x5
        neg             w5,  w5
        lsr             w5,  w5,  #3
        cmp             w5,  w4
        csel            w5,  w5,  w4,  lo      // num_bytes_read
        b               3b
endfunc

function msac_decode_bool_equi_neon, export=1
        ldp             w5,  w6,  [x0, #RNG]   // + CNT
        ldr             x7,  [x0, #DIF]
        bic             w4,  w5,  #0xff        // r &= 0xff00
        add             w4,  w4,  #8
        subs            x8,  x7,  x4, lsl #47  // dif - vw
        lsr             w4,  w4,  #1           // v
        sub             w5,  w5,  w4           // r - v
        cset            w15, lo
        csel            w4,  w5,  w4,  hs      // if (ret) v = r - v;
        csel            x7,  x8,  x7,  hs      // if (ret) dif = dif - vw;

        clz             w5,  w4                // clz(rng)
        eor             w5,  w5,  #16          // d = clz(rng) ^ 16
        lsl             w4,  w4,  w5           // rng << d
        subs            w6,  w6,  w5           // cnt -= d
        lsl             x7,  x7,  x5           // (dif - (v << 48)) << d
        str             w4,  [x0, #RNG]
        b.lo            L(refill)

        str             w6,  [x0, #CNT]
        str             x7,  [x0, #DIF]
        mov             w0,  w15
        ret
endfunc

function msac_decode_bool_neon, export=1
        ldp             w5,  w6,  [x0, #RNG]   // + CNT
        ldr             x7,  [x0, #DIF]
        lsr             w4,  w5,  #8           // r >> 8
        bic             w1,  w1,  #0x3f        // f &= ~63
        mul             w4,  w4,  w1
        lsr             w4,  w4,  #7
        add             w4,  w4,  #4           // v
        subs            x8,  x7,  x4, lsl #48  // dif - vw
        sub             w5,  w5,  w4           // r - v
        cset            w15, lo
        csel            w4,  w5,  w4,  hs      // if (ret) v = r - v;
        csel            x7,  x8,  x7,  hs      // if (ret) dif = dif - vw;

        clz             w5,  w4                // clz(rng)
        eor             w5,  w5,  #16          // d = clz(rng) ^ 16
        lsl             w4,  w4,  w5           // rng << d
        subs            w6,  w6,  w5           // cnt -= d
        lsl             x7,  x7,  x5           // (dif - (v << 48)) << d
        str             w4,  [x0, #RNG]
        b.lo            L(refill)

        str             w6,  [x0, #CNT]
        str             x7,  [x0, #DIF]
        mov             w0,  w15
        ret
endfunc

function msac_decode_bool_adapt_neon, export=1
        ldr             w9,  [x1]              // cdf[0-1]
        ldp             w5,  w6,  [x0, #RNG]   // + CNT
        ldr             x7,  [x0, #DIF]
        lsr             w4,  w5,  #8           // r >> 8
        and             w2,  w9,  #0xffc0      // f &= ~63
        mul             w4,  w4,  w2
        lsr             w4,  w4,  #7
        add             w4,  w4,  #4           // v
        subs            x8,  x7,  x4, lsl #48  // dif - vw
        sub             w5,  w5,  w4           // r - v
        cset            w15, lo
        csel            w4,  w5,  w4,  hs      // if (ret) v = r - v;
        csel            x7,  x8,  x7,  hs      // if (ret) dif = dif - vw;

        ldr             w10, [x0, #ALLOW_UPDATE_CDF]

        clz             w5,  w4                // clz(rng)
        eor             w5,  w5,  #16          // d = clz(rng) ^ 16

        cbz             w10, 1f

        lsr             w2,  w9,  #16          // count = cdf[1]
        and             w9,  w9,  #0xffff      // cdf[0]

        sub             w3,  w2,  w2, lsr #5   // count - (count >= 32)
        lsr             w2,  w2,  #4           // count >> 4
        add             w10, w3,  #1           // count + (count < 32)
        add             w2,  w2,  #4           // rate = (count >> 4) | 4

        sub             w9,  w9,  w15          // cdf[0] -= bit
        sub             w11, w9,  w15, lsl #15 // {cdf[0], cdf[0] - 32769}
        asr             w11, w11, w2           // {cdf[0], cdf[0] - 32769} >> rate
        sub             w9,  w9,  w11          // cdf[0]

        strh            w9,  [x1]
        strh            w10, [x1, #2]

1:
        lsl             w4,  w4,  w5           // rng << d
        subs            w6,  w6,  w5           // cnt -= d
        lsl             x7,  x7,  x5           // (dif - (v << 48)) << d
        str             w4,  [x0, #RNG]
        b.lo            L(refill)

        str             w6,  [x0, #CNT]
        str             x7,  [x0, #DIF]
        mov             w0,  w15
        ret
endfunc

Messung V0.5 in Prozent

¤ Dauer der Verarbeitung: 0.16 Sekunden (vorverarbeitet am 2026-04-25) ¤

Wurzel

Suchen

Beweissystem der NASA

Beweissystem Isabelle

NIST Cobol Testsuite

Cephes Mathematical Library

Wiener Entwicklungsmethode

Haftungshinweis

Die Informationen auf dieser Webseite wurden nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit, noch Qualität der bereit gestellten Informationen zugesichert.

Bemerkung:

Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.