Quelle looprestoration_tmpl.c Sprache: C

/*
* Copyright © 2018, VideoLAN and dav1d authors
* Copyright © 2018, Two Orioles, LLC
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
*    list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
*    this list of conditions and the following disclaimer in the documentation
*    and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#include "config.h"

#include <stdint.h>
#include <stdlib.h>

#include "common/attributes.h"
#include "common/bitdepth.h"
#include "common/intops.h"

#include "src/looprestoration.h"
#include "src/tables.h"

// 256 * 1.5 + 3 + 3 = 390
#define REST_UNIT_STRIDE (390)

static void wiener_filter_h(uint16_t *dst, const pixel (*left)[4],
                            const pixel *src, const int16_t fh[8],
                            const int w, const enum LrEdgeFlags edges
                            HIGHBD_DECL_SUFFIX)
{
    const int bitdepth = bitdepth_from_max(bitdepth_max);
    const int round_bits_h = 3 + (bitdepth == 12) * 2;
    const int rounding_off_h = 1 << (round_bits_h - 1);
    const int clip_limit = 1 << (bitdepth + 1 + 7 - round_bits_h);

    if (w < 6) {
        // For small widths, do the fully conditional loop with
        // conditions on each access.
        for (int x = 0; x < w; x++) {
            int sum = (1 << (bitdepth + 6));
#if BITDEPTH == 8
            sum += src[x] * 128;
#endif
            for (int i = 0; i < 7; i++) {
                int idx = x + i - 3;
                if (idx < 0) {
                    if (!(edges & LR_HAVE_LEFT))
                        sum += src[0] * fh[i];
                    else if (left)
                        sum += left[0][4 + idx] * fh[i];
                    else
                        sum += src[idx] * fh[i];
                } else if (idx >= w && !(edges & LR_HAVE_RIGHT)) {
                    sum += src[w - 1] * fh[i];
                } else
                    sum += src[idx] * fh[i];
            }
            sum = iclip((sum + rounding_off_h) >> round_bits_h, 0, clip_limit - 1);
            dst[x] = sum;
        }

        return;
    }

    // For larger widths, do separate loops with less conditions; first
    // handle the start of the row.
    int start = 3;
    if (!(edges & LR_HAVE_LEFT)) {
        // If there's no left edge, pad using the leftmost pixel.
        for (int x = 0; x < 3; x++) {
            int sum = (1 << (bitdepth + 6));
#if BITDEPTH == 8
            sum += src[x] * 128;
#endif
            for (int i = 0; i < 7; i++) {
                int idx = x + i - 3;
                if (idx < 0)
                    sum += src[0] * fh[i];
                else
                    sum += src[idx] * fh[i];
            }
            sum = iclip((sum + rounding_off_h) >> round_bits_h, 0, clip_limit - 1);
            dst[x] = sum;
        }
    } else if (left) {
        // If we have the left edge and a separate left buffer, pad using that.
        for (int x = 0; x < 3; x++) {
            int sum = (1 << (bitdepth + 6));
#if BITDEPTH == 8
            sum += src[x] * 128;
#endif
            for (int i = 0; i < 7; i++) {
                int idx = x + i - 3;
                if (idx < 0)
                    sum += left[0][4 + idx] * fh[i];
                else
                    sum += src[idx] * fh[i];
            }
            sum = iclip((sum + rounding_off_h) >> round_bits_h, 0, clip_limit - 1);
            dst[x] = sum;
        }
    } else {
        // If we have the left edge, but no separate left buffer, we're in the
        // top/bottom area (lpf) with the left edge existing in the same
        // buffer; just do the regular loop from the start.
        start = 0;
    }
    int end = w - 3;
    if (edges & LR_HAVE_RIGHT)
        end = w;

    // Do a condititon free loop for the bulk of the row.
    for (int x = start; x < end; x++) {
        int sum = (1 << (bitdepth + 6));
#if BITDEPTH == 8
        sum += src[x] * 128;
#endif
        for (int i = 0; i < 7; i++) {
            int idx = x + i - 3;
            sum += src[idx] * fh[i];
        }
        sum = iclip((sum + rounding_off_h) >> round_bits_h, 0, clip_limit - 1);
        dst[x] = sum;
    }

    // If we need to, calculate the end of the row with a condition for
    // right edge padding.
    for (int x = end; x < w; x++) {
        int sum = (1 << (bitdepth + 6));
#if BITDEPTH == 8
        sum += src[x] * 128;
#endif
        for (int i = 0; i < 7; i++) {
            int idx = x + i - 3;
            if (idx >= w)
                sum += src[w - 1] * fh[i];
            else
                sum += src[idx] * fh[i];
        }
        sum = iclip((sum + rounding_off_h) >> round_bits_h, 0, clip_limit - 1);
        dst[x] = sum;
    }
}

static void wiener_filter_v(pixel *p, uint16_t **ptrs, const int16_t fv[8],
                            const int w HIGHBD_DECL_SUFFIX)
{
    const int bitdepth = bitdepth_from_max(bitdepth_max);

    const int round_bits_v = 11 - (bitdepth == 12) * 2;
    const int rounding_off_v = 1 << (round_bits_v - 1);
    const int round_offset = 1 << (bitdepth + (round_bits_v - 1));

    for (int i = 0; i < w; i++) {
        int sum = -round_offset;

        // Only filter using 6 input rows. The 7th row is assumed to be
        // identical to the last one.
        //
        // This function is assumed to only be called at the end, when doing
        // padding at the bottom.
        for (int k = 0; k < 6; k++)
            sum += ptrs[k][i] * fv[k];
        sum += ptrs[5][i] * fv[6];

        p[i] = iclip_pixel((sum + rounding_off_v) >> round_bits_v);
    }

    // Shift the pointers, but only update the first 5; the 6th pointer is kept
    // as it was before (and the 7th is implicitly identical to the 6th).
    for (int i = 0; i < 5; i++)
        ptrs[i] = ptrs[i + 1];
}

static void wiener_filter_hv(pixel *p, uint16_t **ptrs, const pixel (*left)[4],
                             const pixel *src, const int16_t filter[2][8],
                             const int w, const enum LrEdgeFlags edges
                             HIGHBD_DECL_SUFFIX)
{
    const int bitdepth = bitdepth_from_max(bitdepth_max);

    const int round_bits_v = 11 - (bitdepth == 12) * 2;
    const int rounding_off_v = 1 << (round_bits_v - 1);
    const int round_offset = 1 << (bitdepth + (round_bits_v - 1));

    const int16_t *fh = filter[0];
    const int16_t *fv = filter[1];

    // Do combined horziontal and vertical filtering; doing horizontal
    // filtering of one row, combined with vertical filtering of 6
    // preexisting rows and the newly filtered row.

    // For simplicity in the C implementation, just do a separate call
    // of the horizontal filter, into a temporary buffer.
    uint16_t tmp[REST_UNIT_STRIDE];
    wiener_filter_h(tmp, left, src, fh, w, edges HIGHBD_TAIL_SUFFIX);

    for (int i = 0; i < w; i++) {
        int sum = -round_offset;

        // Filter using the 6 stored preexisting rows, and the newly
        // filtered one in tmp[].
        for (int k = 0; k < 6; k++)
            sum += ptrs[k][i] * fv[k];
        sum += tmp[i] * fv[6];
        // At this point, after having read all inputs at point [i], we
        // could overwrite [i] with the newly filtered data.

        p[i] = iclip_pixel((sum + rounding_off_v) >> round_bits_v);
    }

    // For simplicity in the C implementation, just memcpy the newly
    // filtered row into ptrs[6]. Normally, in steady state filtering,
    // this output row, ptrs[6], is equal to ptrs[0]. However at startup,
    // at the top of the filtered area, we may have ptrs[0] equal to ptrs[1],
    // so we can't assume we can write into ptrs[0] but we need to keep
    // a separate pointer for the next row to write into.
    memcpy(ptrs[6], tmp, sizeof(uint16_t) * REST_UNIT_STRIDE);

    // Rotate the window of pointers. Shift the 6 pointers downwards one step.
    for (int i = 0; i < 6; i++)
        ptrs[i] = ptrs[i + 1];
    // The topmost pointer, ptrs[6], which isn't used as input, is set to
    // ptrs[0], which will be used as output for the next _hv call.
    // At the start of the filtering, the caller may set ptrs[6] to the
    // right next buffer to fill in, instead.
    ptrs[6] = ptrs[0];
}

// FIXME Could split into luma and chroma specific functions,
// (since first and last tops are always 0 for chroma)
static void wiener_c(pixel *p, const ptrdiff_t stride,
                     const pixel (*left)[4],
                     const pixel *lpf, const int w, int h,
                     const LooprestorationParams *const params,
                     const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
{
    // Values stored between horizontal and vertical filtering don't
    // fit in a uint8_t.
    uint16_t hor[6 * REST_UNIT_STRIDE];
    uint16_t *ptrs[7], *rows[6];
    for (int i = 0; i < 6; i++)
        rows[i] = &hor[i * REST_UNIT_STRIDE];
    const int16_t (*const filter)[8] = params->filter;
    const int16_t *fh = params->filter[0];
    const int16_t *fv = params->filter[1];
    const pixel *lpf_bottom = lpf + 6*PXSTRIDE(stride);

    const pixel *src = p;
    if (edges & LR_HAVE_TOP) {
        ptrs[0] = rows[0];
        ptrs[1] = rows[0];
        ptrs[2] = rows[1];
        ptrs[3] = rows[2];
        ptrs[4] = rows[2];
        ptrs[5] = rows[2];

        wiener_filter_h(rows[0], NULL, lpf, fh, w, edges HIGHBD_TAIL_SUFFIX);
        lpf += PXSTRIDE(stride);
        wiener_filter_h(rows[1], NULL, lpf, fh, w, edges HIGHBD_TAIL_SUFFIX);

        wiener_filter_h(rows[2], left, src, fh, w, edges HIGHBD_TAIL_SUFFIX);
        left++;
        src += PXSTRIDE(stride);

        if (--h <= 0)
            goto v1;

        ptrs[4] = ptrs[5] = rows[3];
        wiener_filter_h(rows[3], left, src, fh, w, edges HIGHBD_TAIL_SUFFIX);
        left++;
        src += PXSTRIDE(stride);

        if (--h <= 0)
            goto v2;

        ptrs[5] = rows[4];
        wiener_filter_h(rows[4], left, src, fh, w, edges HIGHBD_TAIL_SUFFIX);
        left++;
        src += PXSTRIDE(stride);

        if (--h <= 0)
            goto v3;
    } else {
        ptrs[0] = rows[0];
        ptrs[1] = rows[0];
        ptrs[2] = rows[0];
        ptrs[3] = rows[0];
        ptrs[4] = rows[0];
        ptrs[5] = rows[0];

        wiener_filter_h(rows[0], left, src, fh, w, edges HIGHBD_TAIL_SUFFIX);
        left++;
        src += PXSTRIDE(stride);

        if (--h <= 0)
            goto v1;

        ptrs[4] = ptrs[5] = rows[1];
        wiener_filter_h(rows[1], left, src, fh, w, edges HIGHBD_TAIL_SUFFIX);
        left++;
        src += PXSTRIDE(stride);

        if (--h <= 0)
            goto v2;

        ptrs[5] = rows[2];
        wiener_filter_h(rows[2], left, src, fh, w, edges HIGHBD_TAIL_SUFFIX);
        left++;
        src += PXSTRIDE(stride);

        if (--h <= 0)
            goto v3;

        ptrs[6] = rows[3];
        wiener_filter_hv(p, ptrs, left, src, filter, w, edges
                         HIGHBD_TAIL_SUFFIX);
        left++;
        src += PXSTRIDE(stride);
        p += PXSTRIDE(stride);

        if (--h <= 0)
            goto v3;

        ptrs[6] = rows[4];
        wiener_filter_hv(p, ptrs, left, src, filter, w, edges
                         HIGHBD_TAIL_SUFFIX);
        left++;
        src += PXSTRIDE(stride);
        p += PXSTRIDE(stride);

        if (--h <= 0)
            goto v3;
    }

    ptrs[6] = ptrs[5] + REST_UNIT_STRIDE;
    do {
        wiener_filter_hv(p, ptrs, left, src, filter, w, edges
                         HIGHBD_TAIL_SUFFIX);
        left++;
        src += PXSTRIDE(stride);
        p += PXSTRIDE(stride);
    } while (--h > 0);

    if (!(edges & LR_HAVE_BOTTOM))
        goto v3;

    wiener_filter_hv(p, ptrs, NULL, lpf_bottom, filter, w, edges
                     HIGHBD_TAIL_SUFFIX);
    lpf_bottom += PXSTRIDE(stride);
    p += PXSTRIDE(stride);

    wiener_filter_hv(p, ptrs, NULL, lpf_bottom, filter, w, edges
                     HIGHBD_TAIL_SUFFIX);
    p += PXSTRIDE(stride);
v1:
    wiener_filter_v(p, ptrs, fv, w HIGHBD_TAIL_SUFFIX);

    return;

v3:
    wiener_filter_v(p, ptrs, fv, w HIGHBD_TAIL_SUFFIX);
    p += PXSTRIDE(stride);
v2:
    wiener_filter_v(p, ptrs, fv, w HIGHBD_TAIL_SUFFIX);
    p += PXSTRIDE(stride);
    goto v1;
}

// SGR
static NOINLINE void rotate(int32_t **sumsq_ptrs, coef **sum_ptrs, int n)
{
    int32_t *tmp32 = sumsq_ptrs[0];
    coef *tmpc = sum_ptrs[0];
    for (int i = 0; i < n - 1; i++) {
        sumsq_ptrs[i] = sumsq_ptrs[i + 1];
        sum_ptrs[i] = sum_ptrs[i + 1];
    }
    sumsq_ptrs[n - 1] = tmp32;
    sum_ptrs[n - 1] = tmpc;
}

static NOINLINE void rotate5_x2(int32_t **sumsq_ptrs, coef **sum_ptrs)
{
    int32_t *tmp32[2];
    coef *tmpc[2];
    for (int i = 0; i < 2; i++) {
        tmp32[i] = sumsq_ptrs[i];
        tmpc[i] = sum_ptrs[i];
    }
    for (int i = 0; i < 3; i++) {
        sumsq_ptrs[i] = sumsq_ptrs[i + 2];
        sum_ptrs[i] = sum_ptrs[i + 2];
    }
    for (int i = 0; i < 2; i++) {
        sumsq_ptrs[3 + i] = tmp32[i];
        sum_ptrs[3 + i] = tmpc[i];
    }
}

static NOINLINE void sgr_box3_row_h(int32_t *sumsq, coef *sum,
                                    const pixel (*left)[4],
                                    const pixel *src, const int w,
                                    const enum LrEdgeFlags edges)
{
    sumsq++;
    sum++;
    int a = edges & LR_HAVE_LEFT ? (left ? left[0][2] : src[-2]) : src[0];
    int b = edges & LR_HAVE_LEFT ? (left ? left[0][3] : src[-1]) : src[0];
    for (int x = -1; x < w + 1; x++) {
        int c = (x + 1 < w || (edges & LR_HAVE_RIGHT)) ? src[x + 1] : src[w - 1];
        sum[x] = a + b + c;
        sumsq[x] = a * a + b * b + c * c;
        a = b;
        b = c;
    }
}

static NOINLINE void sgr_box5_row_h(int32_t *sumsq, coef *sum,
                                    const pixel (*left)[4],
                                    const pixel *src, const int w,
                                    const enum LrEdgeFlags edges)
{
    sumsq++;
    sum++;
    int a = edges & LR_HAVE_LEFT ? (left ? left[0][1] : src[-3]) : src[0];
    int b = edges & LR_HAVE_LEFT ? (left ? left[0][2] : src[-2]) : src[0];
    int c = edges & LR_HAVE_LEFT ? (left ? left[0][3] : src[-1]) : src[0];
    int d = src[0];
    for (int x = -1; x < w + 1; x++) {
        int e = (x + 2 < w || (edges & LR_HAVE_RIGHT)) ? src[x + 2] : src[w - 1];
        sum[x] = a + b + c + d + e;
        sumsq[x] = a * a + b * b + c * c + d * d + e * e;
        a = b;
        b = c;
        c = d;
        d = e;
    }
}

static void sgr_box35_row_h(int32_t *sumsq3, coef *sum3,
                            int32_t *sumsq5, coef *sum5,
                            const pixel (*left)[4],
                            const pixel *src, const int w,
                            const enum LrEdgeFlags edges)
{
    sgr_box3_row_h(sumsq3, sum3, left, src, w, edges);
    sgr_box5_row_h(sumsq5, sum5, left, src, w, edges);
}

static NOINLINE void sgr_box3_row_v(int32_t **sumsq, coef **sum,
                                    int32_t *sumsq_out, coef *sum_out,
                                    const int w)
{
    for (int x = 0; x < w + 2; x++) {
        int sq_a = sumsq[0][x];
        int sq_b = sumsq[1][x];
        int sq_c = sumsq[2][x];
        int s_a = sum[0][x];
        int s_b = sum[1][x];
        int s_c = sum[2][x];
        sumsq_out[x] = sq_a + sq_b + sq_c;
        sum_out[x] = s_a + s_b + s_c;
    }
}

static NOINLINE void sgr_box5_row_v(int32_t **sumsq, coef **sum,
                                    int32_t *sumsq_out, coef *sum_out,
                                    const int w)
{
    for (int x = 0; x < w + 2; x++) {
        int sq_a = sumsq[0][x];
        int sq_b = sumsq[1][x];
        int sq_c = sumsq[2][x];
        int sq_d = sumsq[3][x];
        int sq_e = sumsq[4][x];
        int s_a = sum[0][x];
        int s_b = sum[1][x];
        int s_c = sum[2][x];
        int s_d = sum[3][x];
        int s_e = sum[4][x];
        sumsq_out[x] = sq_a + sq_b + sq_c + sq_d + sq_e;
        sum_out[x] = s_a + s_b + s_c + s_d + s_e;
    }
}

static NOINLINE void sgr_calc_row_ab(int32_t *AA, coef *BB, int w, int s,
                                     int bitdepth_max, int n, int sgr_one_by_x)
{
    const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8;
    for (int i = 0; i < w + 2; i++) {
        const int a =
            (AA[i] + ((1 << (2 * bitdepth_min_8)) >> 1)) >> (2 * bitdepth_min_8);
        const int b =
            (BB[i] + ((1 << bitdepth_min_8) >> 1)) >> bitdepth_min_8;

        const unsigned p = imax(a * n - b * b, 0);
        const unsigned z = (p * s + (1 << 19)) >> 20;
        const unsigned x = dav1d_sgr_x_by_x[umin(z, 255)];

        // This is where we invert A and B, so that B is of size coef.
        AA[i] = (x * BB[i] * sgr_one_by_x + (1 << 11)) >> 12;
        BB[i] = x;
    }
}

static void sgr_box3_vert(int32_t **sumsq, coef **sum,
                          int32_t *sumsq_out, coef *sum_out,
                          const int w, const int s, const int bitdepth_max)
{
    sgr_box3_row_v(sumsq, sum, sumsq_out, sum_out, w);
    sgr_calc_row_ab(sumsq_out, sum_out, w, s, bitdepth_max, 9, 455);
    rotate(sumsq, sum, 3);
}

static void sgr_box5_vert(int32_t **sumsq, coef **sum,
                          int32_t *sumsq_out, coef *sum_out,
                          const int w, const int s, const int bitdepth_max)
{
    sgr_box5_row_v(sumsq, sum, sumsq_out, sum_out, w);
    sgr_calc_row_ab(sumsq_out, sum_out, w, s, bitdepth_max, 25, 164);
    rotate5_x2(sumsq, sum);
}

static void sgr_box3_hv(int32_t **sumsq, coef **sum,
                        int32_t *AA, coef *BB,
                        const pixel (*left)[4],
                        const pixel *src, const int w,
                        const int s,
                        const enum LrEdgeFlags edges,
                        const int bitdepth_max)
{
    sgr_box3_row_h(sumsq[2], sum[2], left, src, w, edges);
    sgr_box3_vert(sumsq, sum, AA, BB, w, s, bitdepth_max);
}

static NOINLINE void sgr_finish_filter_row1(coef *tmp,
                                            const pixel *src,
                                            int32_t **A_ptrs, coef **B_ptrs,
                                            const int w)
{
#define EIGHT_NEIGHBORS(P, i)\
    ((P[1][i] + P[1][i - 1] + P[1][i + 1] + P[0][i] + P[2][i]) * 4 + \
     (P[0][i - 1] + P[2][i - 1] +                           \
      P[0][i + 1] + P[2][i + 1]) * 3)
    for (int i = 0; i < w; i++) {
        const int a = EIGHT_NEIGHBORS(B_ptrs, i + 1);
        const int b = EIGHT_NEIGHBORS(A_ptrs, i + 1);
        tmp[i] = (b - a * src[i] + (1 << 8)) >> 9;
    }
#undef EIGHT_NEIGHBORS
}

#define FILTER_OUT_STRIDE (384)

static NOINLINE void sgr_finish_filter2(coef *tmp,
                                        const pixel *src,
                                        const ptrdiff_t src_stride,
                                        int32_t **A_ptrs, coef **B_ptrs,
                                        const int w, const int h)
{
#define SIX_NEIGHBORS(P, i)\
    ((P[0][i]     + P[1][i]) * 6 +   \
     (P[0][i - 1] + P[1][i - 1] +    \
      P[0][i + 1] + P[1][i + 1]) * 5)
    for (int i = 0; i < w; i++) {
        const int a = SIX_NEIGHBORS(B_ptrs, i + 1);
        const int b = SIX_NEIGHBORS(A_ptrs, i + 1);
        tmp[i] = (b - a * src[i] + (1 << 8)) >> 9;
    }
    if (h <= 1)
        return;
    tmp += FILTER_OUT_STRIDE;
    src += PXSTRIDE(src_stride);
    const int32_t *A = &A_ptrs[1][1];
    const coef *B = &B_ptrs[1][1];
    for (int i = 0; i < w; i++) {
        const int a = B[i] * 6 + (B[i - 1] + B[i + 1]) * 5;
        const int b = A[i] * 6 + (A[i - 1] + A[i + 1]) * 5;
        tmp[i] = (b - a * src[i] + (1 << 7)) >> 8;
    }
#undef SIX_NEIGHBORS
}

static NOINLINE void sgr_weighted_row1(pixel *dst, const coef *t1,
                                       const int w, const int w1 HIGHBD_DECL_SUFFIX)
{
    for (int i = 0; i < w; i++) {
        const int v = w1 * t1[i];
        dst[i] = iclip_pixel(dst[i] + ((v + (1 << 10)) >> 11));
    }
}

static NOINLINE void sgr_weighted2(pixel *dst, const ptrdiff_t dst_stride,
                                   const coef *t1, const coef *t2,
                                   const int w, const int h,
                                   const int w0, const int w1 HIGHBD_DECL_SUFFIX)
{
    for (int j = 0; j < h; j++) {
        for (int i = 0; i < w; i++) {
            const int v = w0 * t1[i] + w1 * t2[i];
            dst[i] = iclip_pixel(dst[i] + ((v + (1 << 10)) >> 11));
        }
        dst += PXSTRIDE(dst_stride);
        t1 += FILTER_OUT_STRIDE;
        t2 += FILTER_OUT_STRIDE;
    }
}

static NOINLINE void sgr_finish1(pixel **dst, const ptrdiff_t stride,
                                 int32_t **A_ptrs, coef **B_ptrs, const int w,
                                 const int w1 HIGHBD_DECL_SUFFIX)
{
    // Only one single row, no stride needed
    ALIGN_STK_16(coef, tmp, 384,);

    sgr_finish_filter_row1(tmp, *dst, A_ptrs, B_ptrs, w);
    sgr_weighted_row1(*dst, tmp, w, w1 HIGHBD_TAIL_SUFFIX);
    *dst += PXSTRIDE(stride);
    rotate(A_ptrs, B_ptrs, 3);
}

static NOINLINE void sgr_finish2(pixel **dst, const ptrdiff_t stride,
                                 int32_t **A_ptrs, coef **B_ptrs,
                                 const int w, const int h, const int w1
                                 HIGHBD_DECL_SUFFIX)
{
    ALIGN_STK_16(coef, tmp, 2*FILTER_OUT_STRIDE,);

    sgr_finish_filter2(tmp, *dst, stride, A_ptrs, B_ptrs, w, h);
    sgr_weighted_row1(*dst, tmp, w, w1 HIGHBD_TAIL_SUFFIX);
    *dst += PXSTRIDE(stride);
    if (h > 1) {
        sgr_weighted_row1(*dst, tmp + FILTER_OUT_STRIDE, w, w1 HIGHBD_TAIL_SUFFIX);
        *dst += PXSTRIDE(stride);
    }
    rotate(A_ptrs, B_ptrs, 2);
}

static NOINLINE void sgr_finish_mix(pixel **dst, const ptrdiff_t stride,
                                    int32_t **A5_ptrs, coef **B5_ptrs,
                                    int32_t **A3_ptrs, coef **B3_ptrs,
                                    const int w, const int h,
                                    const int w0, const int w1 HIGHBD_DECL_SUFFIX)
{
    ALIGN_STK_16(coef, tmp5, 2*FILTER_OUT_STRIDE,);
    ALIGN_STK_16(coef, tmp3, 2*FILTER_OUT_STRIDE,);

    sgr_finish_filter2(tmp5, *dst, stride, A5_ptrs, B5_ptrs, w, h);
    sgr_finish_filter_row1(tmp3, *dst, A3_ptrs, B3_ptrs, w);
    if (h > 1)
        sgr_finish_filter_row1(tmp3 + FILTER_OUT_STRIDE, *dst + PXSTRIDE(stride),
                               &A3_ptrs[1], &B3_ptrs[1], w);
    sgr_weighted2(*dst, stride, tmp5, tmp3, w, h, w0, w1 HIGHBD_TAIL_SUFFIX);
    *dst += h*PXSTRIDE(stride);
    rotate(A5_ptrs, B5_ptrs, 2);
    rotate(A3_ptrs, B3_ptrs, 4);
}

static void sgr_3x3_c(pixel *dst, const ptrdiff_t stride,
                      const pixel (*left)[4], const pixel *lpf,
                      const int w, int h,
                      const LooprestorationParams *const params,
                      const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
{
#define BUF_STRIDE (384 + 16)
    ALIGN_STK_16(int32_t, sumsq_buf, BUF_STRIDE * 3 + 16,);
    ALIGN_STK_16(coef, sum_buf, BUF_STRIDE * 3 + 16,);
    int32_t *sumsq_ptrs[3], *sumsq_rows[3];
    coef *sum_ptrs[3], *sum_rows[3];
    for (int i = 0; i < 3; i++) {
        sumsq_rows[i] = &sumsq_buf[i * BUF_STRIDE];
        sum_rows[i] = &sum_buf[i * BUF_STRIDE];
    }

    ALIGN_STK_16(int32_t, A_buf, BUF_STRIDE * 3 + 16,);
    ALIGN_STK_16(coef, B_buf, BUF_STRIDE * 3 + 16,);
    int32_t *A_ptrs[3];
    coef *B_ptrs[3];
    for (int i = 0; i < 3; i++) {
        A_ptrs[i] = &A_buf[i * BUF_STRIDE];
        B_ptrs[i] = &B_buf[i * BUF_STRIDE];
    }
    const pixel *src = dst;
    const pixel *lpf_bottom = lpf + 6*PXSTRIDE(stride);

    if (edges & LR_HAVE_TOP) {
        sumsq_ptrs[0] = sumsq_rows[0];
        sumsq_ptrs[1] = sumsq_rows[1];
        sumsq_ptrs[2] = sumsq_rows[2];
        sum_ptrs[0] = sum_rows[0];
        sum_ptrs[1] = sum_rows[1];
        sum_ptrs[2] = sum_rows[2];

        sgr_box3_row_h(sumsq_rows[0], sum_rows[0], NULL, lpf, w, edges);
        lpf += PXSTRIDE(stride);
        sgr_box3_row_h(sumsq_rows[1], sum_rows[1], NULL, lpf, w, edges);

        sgr_box3_hv(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
                    left, src, w, params->sgr.s1, edges, BITDEPTH_MAX);
        left++;
        src += PXSTRIDE(stride);
        rotate(A_ptrs, B_ptrs, 3);

        if (--h <= 0)
            goto vert_1;

        sgr_box3_hv(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
                    left, src, w, params->sgr.s1, edges, BITDEPTH_MAX);
        left++;
        src += PXSTRIDE(stride);
        rotate(A_ptrs, B_ptrs, 3);

        if (--h <= 0)
            goto vert_2;
    } else {
        sumsq_ptrs[0] = sumsq_rows[0];
        sumsq_ptrs[1] = sumsq_rows[0];
        sumsq_ptrs[2] = sumsq_rows[0];
        sum_ptrs[0] = sum_rows[0];
        sum_ptrs[1] = sum_rows[0];
        sum_ptrs[2] = sum_rows[0];

        sgr_box3_row_h(sumsq_rows[0], sum_rows[0], left, src, w, edges);
        left++;
        src += PXSTRIDE(stride);

        sgr_box3_vert(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
                      w, params->sgr.s1, BITDEPTH_MAX);
        rotate(A_ptrs, B_ptrs, 3);

        if (--h <= 0)
            goto vert_1;

        sumsq_ptrs[2] = sumsq_rows[1];
        sum_ptrs[2] = sum_rows[1];

        sgr_box3_hv(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
                    left, src, w, params->sgr.s1, edges, BITDEPTH_MAX);
        left++;
        src += PXSTRIDE(stride);
        rotate(A_ptrs, B_ptrs, 3);

        if (--h <= 0)
            goto vert_2;

        sumsq_ptrs[2] = sumsq_rows[2];
        sum_ptrs[2] = sum_rows[2];
    }

    do {
        sgr_box3_hv(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
                    left, src, w, params->sgr.s1, edges, BITDEPTH_MAX);
        left++;
        src += PXSTRIDE(stride);

        sgr_finish1(&dst, stride, A_ptrs, B_ptrs,
                    w, params->sgr.w1 HIGHBD_TAIL_SUFFIX);
    } while (--h > 0);

    if (!(edges & LR_HAVE_BOTTOM))
        goto vert_2;

    sgr_box3_hv(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
                NULL, lpf_bottom, w, params->sgr.s1, edges, BITDEPTH_MAX);
    lpf_bottom += PXSTRIDE(stride);

    sgr_finish1(&dst, stride, A_ptrs, B_ptrs,
                w, params->sgr.w1 HIGHBD_TAIL_SUFFIX);

    sgr_box3_hv(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
                NULL, lpf_bottom, w, params->sgr.s1, edges, BITDEPTH_MAX);

    sgr_finish1(&dst, stride, A_ptrs, B_ptrs,
                w, params->sgr.w1 HIGHBD_TAIL_SUFFIX);
    return;

vert_2:
    sumsq_ptrs[2] = sumsq_ptrs[1];
    sum_ptrs[2] = sum_ptrs[1];
    sgr_box3_vert(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
                  w, params->sgr.s1, BITDEPTH_MAX);

    sgr_finish1(&dst, stride, A_ptrs, B_ptrs,
                w, params->sgr.w1 HIGHBD_TAIL_SUFFIX);

output_1:
    sumsq_ptrs[2] = sumsq_ptrs[1];
    sum_ptrs[2] = sum_ptrs[1];
    sgr_box3_vert(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
                  w, params->sgr.s1, BITDEPTH_MAX);

    sgr_finish1(&dst, stride, A_ptrs, B_ptrs,
                w, params->sgr.w1 HIGHBD_TAIL_SUFFIX);
    return;

vert_1:
    sumsq_ptrs[2] = sumsq_ptrs[1];
    sum_ptrs[2] = sum_ptrs[1];
    sgr_box3_vert(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
                  w, params->sgr.s1, BITDEPTH_MAX);
    rotate(A_ptrs, B_ptrs, 3);
    goto output_1;
}

static void sgr_5x5_c(pixel *dst, const ptrdiff_t stride,
                      const pixel (*left)[4], const pixel *lpf,
                      const int w, int h,
                      const LooprestorationParams *const params,
                      const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
{
    ALIGN_STK_16(int32_t, sumsq_buf, BUF_STRIDE * 5 + 16,);
    ALIGN_STK_16(coef, sum_buf, BUF_STRIDE * 5 + 16,);
    int32_t *sumsq_ptrs[5], *sumsq_rows[5];
    coef *sum_ptrs[5], *sum_rows[5];
    for (int i = 0; i < 5; i++) {
        sumsq_rows[i] = &sumsq_buf[i * BUF_STRIDE];
        sum_rows[i] = &sum_buf[i * BUF_STRIDE];
    }

    ALIGN_STK_16(int32_t, A_buf, BUF_STRIDE * 2 + 16,);
    ALIGN_STK_16(coef, B_buf, BUF_STRIDE * 2 + 16,);
    int32_t *A_ptrs[2];
    coef *B_ptrs[2];
    for (int i = 0; i < 2; i++) {
        A_ptrs[i] = &A_buf[i * BUF_STRIDE];
        B_ptrs[i] = &B_buf[i * BUF_STRIDE];
    }
    const pixel *src = dst;
    const pixel *lpf_bottom = lpf + 6*PXSTRIDE(stride);

    if (edges & LR_HAVE_TOP) {
        sumsq_ptrs[0] = sumsq_rows[0];
        sumsq_ptrs[1] = sumsq_rows[0];
        sumsq_ptrs[2] = sumsq_rows[1];
        sumsq_ptrs[3] = sumsq_rows[2];
        sumsq_ptrs[4] = sumsq_rows[3];
        sum_ptrs[0] = sum_rows[0];
        sum_ptrs[1] = sum_rows[0];
        sum_ptrs[2] = sum_rows[1];
        sum_ptrs[3] = sum_rows[2];
        sum_ptrs[4] = sum_rows[3];

        sgr_box5_row_h(sumsq_rows[0], sum_rows[0], NULL, lpf, w, edges);
        lpf += PXSTRIDE(stride);
        sgr_box5_row_h(sumsq_rows[1], sum_rows[1], NULL, lpf, w, edges);

        sgr_box5_row_h(sumsq_rows[2], sum_rows[2], left, src, w, edges);
        left++;
        src += PXSTRIDE(stride);

        if (--h <= 0)
            goto vert_1;

        sgr_box5_row_h(sumsq_rows[3], sum_rows[3], left, src, w, edges);
        left++;
        src += PXSTRIDE(stride);
        sgr_box5_vert(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
                      w, params->sgr.s0, BITDEPTH_MAX);
        rotate(A_ptrs, B_ptrs, 2);

        if (--h <= 0)
            goto vert_2;

        // ptrs are rotated by 2; both [3] and [4] now point at rows[0]; set
        // one of them to point at the previously unused rows[4].
        sumsq_ptrs[3] = sumsq_rows[4];
        sum_ptrs[3] = sum_rows[4];
    } else {
        sumsq_ptrs[0] = sumsq_rows[0];
        sumsq_ptrs[1] = sumsq_rows[0];
        sumsq_ptrs[2] = sumsq_rows[0];
        sumsq_ptrs[3] = sumsq_rows[0];
        sumsq_ptrs[4] = sumsq_rows[0];
        sum_ptrs[0] = sum_rows[0];
        sum_ptrs[1] = sum_rows[0];
        sum_ptrs[2] = sum_rows[0];
        sum_ptrs[3] = sum_rows[0];
        sum_ptrs[4] = sum_rows[0];

        sgr_box5_row_h(sumsq_rows[0], sum_rows[0], left, src, w, edges);
        left++;
        src += PXSTRIDE(stride);

        if (--h <= 0)
            goto vert_1;

        sumsq_ptrs[4] = sumsq_rows[1];
        sum_ptrs[4] = sum_rows[1];

        sgr_box5_row_h(sumsq_rows[1], sum_rows[1], left, src, w, edges);
        left++;
        src += PXSTRIDE(stride);

        sgr_box5_vert(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
                      w, params->sgr.s0, BITDEPTH_MAX);
        rotate(A_ptrs, B_ptrs, 2);

        if (--h <= 0)
            goto vert_2;

        sumsq_ptrs[3] = sumsq_rows[2];
        sumsq_ptrs[4] = sumsq_rows[3];
        sum_ptrs[3] = sum_rows[2];
        sum_ptrs[4] = sum_rows[3];

        sgr_box5_row_h(sumsq_rows[2], sum_rows[2], left, src, w, edges);
        left++;
        src += PXSTRIDE(stride);

        if (--h <= 0)
            goto odd;

        sgr_box5_row_h(sumsq_rows[3], sum_rows[3], left, src, w, edges);
        left++;
        src += PXSTRIDE(stride);

        sgr_box5_vert(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
                      w, params->sgr.s0, BITDEPTH_MAX);
        sgr_finish2(&dst, stride, A_ptrs, B_ptrs,
                    w, 2, params->sgr.w0 HIGHBD_TAIL_SUFFIX);

        if (--h <= 0)
            goto vert_2;

        // ptrs are rotated by 2; both [3] and [4] now point at rows[0]; set
        // one of them to point at the previously unused rows[4].
        sumsq_ptrs[3] = sumsq_rows[4];
        sum_ptrs[3] = sum_rows[4];
    }

    do {
        sgr_box5_row_h(sumsq_ptrs[3], sum_ptrs[3], left, src, w, edges);
        left++;
        src += PXSTRIDE(stride);

        if (--h <= 0)
            goto odd;

        sgr_box5_row_h(sumsq_ptrs[4], sum_ptrs[4], left, src, w, edges);
        left++;
        src += PXSTRIDE(stride);

        sgr_box5_vert(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
                      w, params->sgr.s0, BITDEPTH_MAX);
        sgr_finish2(&dst, stride, A_ptrs, B_ptrs,
                    w, 2, params->sgr.w0 HIGHBD_TAIL_SUFFIX);
    } while (--h > 0);

    if (!(edges & LR_HAVE_BOTTOM))
        goto vert_2;

    sgr_box5_row_h(sumsq_ptrs[3], sum_ptrs[3], NULL, lpf_bottom, w, edges);
    lpf_bottom += PXSTRIDE(stride);
    sgr_box5_row_h(sumsq_ptrs[4], sum_ptrs[4], NULL, lpf_bottom, w, edges);

output_2:
    sgr_box5_vert(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
                  w, params->sgr.s0, BITDEPTH_MAX);
    sgr_finish2(&dst, stride, A_ptrs, B_ptrs,
                w, 2, params->sgr.w0 HIGHBD_TAIL_SUFFIX);
    return;

vert_2:
    // Duplicate the last row twice more
    sumsq_ptrs[3] = sumsq_ptrs[2];
    sumsq_ptrs[4] = sumsq_ptrs[2];
    sum_ptrs[3] = sum_ptrs[2];
    sum_ptrs[4] = sum_ptrs[2];
    goto output_2;

odd:
    // Copy the last row as padding once
    sumsq_ptrs[4] = sumsq_ptrs[3];
    sum_ptrs[4] = sum_ptrs[3];

    sgr_box5_vert(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
                  w, params->sgr.s0, BITDEPTH_MAX);
    sgr_finish2(&dst, stride, A_ptrs, B_ptrs,
                w, 2, params->sgr.w0 HIGHBD_TAIL_SUFFIX);

output_1:
    // Duplicate the last row twice more
    sumsq_ptrs[3] = sumsq_ptrs[2];
    sumsq_ptrs[4] = sumsq_ptrs[2];
    sum_ptrs[3] = sum_ptrs[2];
    sum_ptrs[4] = sum_ptrs[2];

    sgr_box5_vert(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
                  w, params->sgr.s0, BITDEPTH_MAX);
    // Output only one row
    sgr_finish2(&dst, stride, A_ptrs, B_ptrs,
                w, 1, params->sgr.w0 HIGHBD_TAIL_SUFFIX);
    return;

vert_1:
    // Copy the last row as padding once
    sumsq_ptrs[4] = sumsq_ptrs[3];
    sum_ptrs[4] = sum_ptrs[3];

    sgr_box5_vert(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
                  w, params->sgr.s0, BITDEPTH_MAX);
    rotate(A_ptrs, B_ptrs, 2);

    goto output_1;
}

static void sgr_mix_c(pixel *dst, const ptrdiff_t stride,
                      const pixel (*left)[4], const pixel *lpf,
                      const int w, int h,
                      const LooprestorationParams *const params,
                      const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
{
    ALIGN_STK_16(int32_t, sumsq5_buf, BUF_STRIDE * 5 + 16,);
    ALIGN_STK_16(coef, sum5_buf, BUF_STRIDE * 5 + 16,);
    int32_t *sumsq5_ptrs[5], *sumsq5_rows[5];
    coef *sum5_ptrs[5], *sum5_rows[5];
    for (int i = 0; i < 5; i++) {
        sumsq5_rows[i] = &sumsq5_buf[i * BUF_STRIDE];
        sum5_rows[i] = &sum5_buf[i * BUF_STRIDE];
    }
    ALIGN_STK_16(int32_t, sumsq3_buf, BUF_STRIDE * 3 + 16,);
    ALIGN_STK_16(coef, sum3_buf, BUF_STRIDE * 3 + 16,);
    int32_t *sumsq3_ptrs[3], *sumsq3_rows[3];
    coef *sum3_ptrs[3], *sum3_rows[3];
    for (int i = 0; i < 3; i++) {
        sumsq3_rows[i] = &sumsq3_buf[i * BUF_STRIDE];
        sum3_rows[i] = &sum3_buf[i * BUF_STRIDE];
    }

    ALIGN_STK_16(int32_t, A5_buf, BUF_STRIDE * 2 + 16,);
    ALIGN_STK_16(coef, B5_buf, BUF_STRIDE * 2 + 16,);
    int32_t *A5_ptrs[2];
    coef *B5_ptrs[2];
    for (int i = 0; i < 2; i++) {
        A5_ptrs[i] = &A5_buf[i * BUF_STRIDE];
        B5_ptrs[i] = &B5_buf[i * BUF_STRIDE];
    }
    ALIGN_STK_16(int32_t, A3_buf, BUF_STRIDE * 4 + 16,);
    ALIGN_STK_16(coef, B3_buf, BUF_STRIDE * 4 + 16,);
    int32_t *A3_ptrs[4];
    coef *B3_ptrs[4];
    for (int i = 0; i < 4; i++) {
        A3_ptrs[i] = &A3_buf[i * BUF_STRIDE];
        B3_ptrs[i] = &B3_buf[i * BUF_STRIDE];
    }
    const pixel *src = dst;
    const pixel *lpf_bottom = lpf + 6*PXSTRIDE(stride);

    if (edges & LR_HAVE_TOP) {
        sumsq5_ptrs[0] = sumsq5_rows[0];
        sumsq5_ptrs[1] = sumsq5_rows[0];
        sumsq5_ptrs[2] = sumsq5_rows[1];
        sumsq5_ptrs[3] = sumsq5_rows[2];
        sumsq5_ptrs[4] = sumsq5_rows[3];
        sum5_ptrs[0] = sum5_rows[0];
        sum5_ptrs[1] = sum5_rows[0];
        sum5_ptrs[2] = sum5_rows[1];
        sum5_ptrs[3] = sum5_rows[2];
        sum5_ptrs[4] = sum5_rows[3];

        sumsq3_ptrs[0] = sumsq3_rows[0];
        sumsq3_ptrs[1] = sumsq3_rows[1];
        sumsq3_ptrs[2] = sumsq3_rows[2];
        sum3_ptrs[0] = sum3_rows[0];
        sum3_ptrs[1] = sum3_rows[1];
        sum3_ptrs[2] = sum3_rows[2];

        sgr_box35_row_h(sumsq3_rows[0], sum3_rows[0],
                        sumsq5_rows[0], sum5_rows[0],
                        NULL, lpf, w, edges);
        lpf += PXSTRIDE(stride);
        sgr_box35_row_h(sumsq3_rows[1], sum3_rows[1],
                        sumsq5_rows[1], sum5_rows[1],
                        NULL, lpf, w, edges);

        sgr_box35_row_h(sumsq3_rows[2], sum3_rows[2],
                        sumsq5_rows[2], sum5_rows[2],
                        left, src, w, edges);
        left++;
        src += PXSTRIDE(stride);

        sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
                      w, params->sgr.s1, BITDEPTH_MAX);
        rotate(A3_ptrs, B3_ptrs, 4);

        if (--h <= 0)
            goto vert_1;

        sgr_box35_row_h(sumsq3_ptrs[2], sum3_ptrs[2],
                        sumsq5_rows[3], sum5_rows[3],
                        left, src, w, edges);
        left++;
        src += PXSTRIDE(stride);
        sgr_box5_vert(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1],
                      w, params->sgr.s0, BITDEPTH_MAX);
        rotate(A5_ptrs, B5_ptrs, 2);
        sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
                      w, params->sgr.s1, BITDEPTH_MAX);
        rotate(A3_ptrs, B3_ptrs, 4);

        if (--h <= 0)
            goto vert_2;

        // ptrs are rotated by 2; both [3] and [4] now point at rows[0]; set
        // one of them to point at the previously unused rows[4].
        sumsq5_ptrs[3] = sumsq5_rows[4];
        sum5_ptrs[3] = sum5_rows[4];
    } else {
        sumsq5_ptrs[0] = sumsq5_rows[0];
        sumsq5_ptrs[1] = sumsq5_rows[0];
        sumsq5_ptrs[2] = sumsq5_rows[0];
        sumsq5_ptrs[3] = sumsq5_rows[0];
        sumsq5_ptrs[4] = sumsq5_rows[0];
        sum5_ptrs[0] = sum5_rows[0];
        sum5_ptrs[1] = sum5_rows[0];
        sum5_ptrs[2] = sum5_rows[0];
        sum5_ptrs[3] = sum5_rows[0];
        sum5_ptrs[4] = sum5_rows[0];

        sumsq3_ptrs[0] = sumsq3_rows[0];
        sumsq3_ptrs[1] = sumsq3_rows[0];
        sumsq3_ptrs[2] = sumsq3_rows[0];
        sum3_ptrs[0] = sum3_rows[0];
        sum3_ptrs[1] = sum3_rows[0];
        sum3_ptrs[2] = sum3_rows[0];

        sgr_box35_row_h(sumsq3_rows[0], sum3_rows[0],
                        sumsq5_rows[0], sum5_rows[0],
                        left, src, w, edges);
        left++;
        src += PXSTRIDE(stride);

        sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
                      w, params->sgr.s1, BITDEPTH_MAX);
        rotate(A3_ptrs, B3_ptrs, 4);

        if (--h <= 0)
            goto vert_1;

        sumsq5_ptrs[4] = sumsq5_rows[1];
        sum5_ptrs[4] = sum5_rows[1];

        sumsq3_ptrs[2] = sumsq3_rows[1];
        sum3_ptrs[2] = sum3_rows[1];

        sgr_box35_row_h(sumsq3_rows[1], sum3_rows[1],
                        sumsq5_rows[1], sum5_rows[1],
                        left, src, w, edges);
        left++;
        src += PXSTRIDE(stride);

        sgr_box5_vert(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1],
                      w, params->sgr.s0, BITDEPTH_MAX);
        rotate(A5_ptrs, B5_ptrs, 2);
        sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
                      w, params->sgr.s1, BITDEPTH_MAX);
        rotate(A3_ptrs, B3_ptrs, 4);

        if (--h <= 0)
            goto vert_2;

        sumsq5_ptrs[3] = sumsq5_rows[2];
        sumsq5_ptrs[4] = sumsq5_rows[3];
        sum5_ptrs[3] = sum5_rows[2];
        sum5_ptrs[4] = sum5_rows[3];

        sumsq3_ptrs[2] = sumsq3_rows[2];
        sum3_ptrs[2] = sum3_rows[2];

        sgr_box35_row_h(sumsq3_rows[2], sum3_rows[2],
                        sumsq5_rows[2], sum5_rows[2],
                        left, src, w, edges);
        left++;
        src += PXSTRIDE(stride);

        sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
                      w, params->sgr.s1, BITDEPTH_MAX);
        rotate(A3_ptrs, B3_ptrs, 4);

        if (--h <= 0)
            goto odd;

        sgr_box35_row_h(sumsq3_ptrs[2], sum3_ptrs[2],
                        sumsq5_rows[3], sum5_rows[3],
                        left, src, w, edges);
        left++;
        src += PXSTRIDE(stride);

        sgr_box5_vert(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1],
                      w, params->sgr.s0, BITDEPTH_MAX);
        sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
                      w, params->sgr.s1, BITDEPTH_MAX);
        sgr_finish_mix(&dst, stride, A5_ptrs, B5_ptrs, A3_ptrs, B3_ptrs,
                       w, 2, params->sgr.w0, params->sgr.w1
                       HIGHBD_TAIL_SUFFIX);

        if (--h <= 0)
            goto vert_2;

        // ptrs are rotated by 2; both [3] and [4] now point at rows[0]; set
        // one of them to point at the previously unused rows[4].
        sumsq5_ptrs[3] = sumsq5_rows[4];
        sum5_ptrs[3] = sum5_rows[4];
    }

    do {
        sgr_box35_row_h(sumsq3_ptrs[2], sum3_ptrs[2],
                        sumsq5_ptrs[3], sum5_ptrs[3],
                        left, src, w, edges);
        left++;
        src += PXSTRIDE(stride);

        sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
                      w, params->sgr.s1, BITDEPTH_MAX);
        rotate(A3_ptrs, B3_ptrs, 4);

        if (--h <= 0)
            goto odd;

        sgr_box35_row_h(sumsq3_ptrs[2], sum3_ptrs[2],
                        sumsq5_ptrs[4], sum5_ptrs[4],
                        left, src, w, edges);
        left++;
        src += PXSTRIDE(stride);

        sgr_box5_vert(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1],
                      w, params->sgr.s0, BITDEPTH_MAX);
        sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
                      w, params->sgr.s1, BITDEPTH_MAX);
        sgr_finish_mix(&dst, stride, A5_ptrs, B5_ptrs, A3_ptrs, B3_ptrs,
                       w, 2, params->sgr.w0, params->sgr.w1
                       HIGHBD_TAIL_SUFFIX);
    } while (--h > 0);

    if (!(edges & LR_HAVE_BOTTOM))
        goto vert_2;

    sgr_box35_row_h(sumsq3_ptrs[2], sum3_ptrs[2],
                    sumsq5_ptrs[3], sum5_ptrs[3],
                    NULL, lpf_bottom, w, edges);
    lpf_bottom += PXSTRIDE(stride);
    sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
                  w, params->sgr.s1, BITDEPTH_MAX);
    rotate(A3_ptrs, B3_ptrs, 4);

    sgr_box35_row_h(sumsq3_ptrs[2], sum3_ptrs[2],
                    sumsq5_ptrs[4], sum5_ptrs[4],
                    NULL, lpf_bottom, w, edges);

output_2:
    sgr_box5_vert(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1],
                  w, params->sgr.s0, BITDEPTH_MAX);
    sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
                  w, params->sgr.s1, BITDEPTH_MAX);
    sgr_finish_mix(&dst, stride, A5_ptrs, B5_ptrs, A3_ptrs, B3_ptrs,
                   w, 2, params->sgr.w0, params->sgr.w1
                   HIGHBD_TAIL_SUFFIX);
    return;

vert_2:
    // Duplicate the last row twice more
    sumsq5_ptrs[3] = sumsq5_ptrs[2];
    sumsq5_ptrs[4] = sumsq5_ptrs[2];
    sum5_ptrs[3] = sum5_ptrs[2];
    sum5_ptrs[4] = sum5_ptrs[2];

    sumsq3_ptrs[2] = sumsq3_ptrs[1];
    sum3_ptrs[2] = sum3_ptrs[1];
    sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
                  w, params->sgr.s1, BITDEPTH_MAX);
    rotate(A3_ptrs, B3_ptrs, 4);

    sumsq3_ptrs[2] = sumsq3_ptrs[1];
    sum3_ptrs[2] = sum3_ptrs[1];

    goto output_2;

odd:
    // Copy the last row as padding once
    sumsq5_ptrs[4] = sumsq5_ptrs[3];
    sum5_ptrs[4] = sum5_ptrs[3];

    sumsq3_ptrs[2] = sumsq3_ptrs[1];
    sum3_ptrs[2] = sum3_ptrs[1];

    sgr_box5_vert(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1],
                  w, params->sgr.s0, BITDEPTH_MAX);
    sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
                  w, params->sgr.s1, BITDEPTH_MAX);
    sgr_finish_mix(&dst, stride, A5_ptrs, B5_ptrs, A3_ptrs, B3_ptrs,
                   w, 2, params->sgr.w0, params->sgr.w1
                   HIGHBD_TAIL_SUFFIX);

output_1:
    // Duplicate the last row twice more
    sumsq5_ptrs[3] = sumsq5_ptrs[2];
    sumsq5_ptrs[4] = sumsq5_ptrs[2];
    sum5_ptrs[3] = sum5_ptrs[2];
    sum5_ptrs[4] = sum5_ptrs[2];

    sumsq3_ptrs[2] = sumsq3_ptrs[1];
    sum3_ptrs[2] = sum3_ptrs[1];

    sgr_box5_vert(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1],
                  w, params->sgr.s0, BITDEPTH_MAX);
    sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
                  w, params->sgr.s1, BITDEPTH_MAX);
    rotate(A3_ptrs, B3_ptrs, 4);
    // Output only one row
    sgr_finish_mix(&dst, stride, A5_ptrs, B5_ptrs, A3_ptrs, B3_ptrs,
                   w, 1, params->sgr.w0, params->sgr.w1
                   HIGHBD_TAIL_SUFFIX);
    return;

vert_1:
    // Copy the last row as padding once
    sumsq5_ptrs[4] = sumsq5_ptrs[3];
    sum5_ptrs[4] = sum5_ptrs[3];

    sumsq3_ptrs[2] = sumsq3_ptrs[1];
    sum3_ptrs[2] = sum3_ptrs[1];

    sgr_box5_vert(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1],
                  w, params->sgr.s0, BITDEPTH_MAX);
    rotate(A5_ptrs, B5_ptrs, 2);
    sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
                  w, params->sgr.s1, BITDEPTH_MAX);
    rotate(A3_ptrs, B3_ptrs, 4);

    goto output_1;
}

#if HAVE_ASM
#if ARCH_AARCH64 || ARCH_ARM
#include "src/arm/looprestoration.h"
#elif ARCH_LOONGARCH64
#include "src/loongarch/looprestoration.h"
#elif ARCH_PPC64LE
#include "src/ppc/looprestoration.h"
#elif ARCH_X86
#include "src/x86/looprestoration.h"
#endif
#endif

COLD void bitfn(dav1d_loop_restoration_dsp_init)(Dav1dLoopRestorationDSPContext *const c,
                                                 const int bpc)
{
    c->wiener[0] = c->wiener[1] = wiener_c;
    c->sgr[0] = sgr_5x5_c;
    c->sgr[1] = sgr_3x3_c;
    c->sgr[2] = sgr_mix_c;

#if HAVE_ASM
#if ARCH_AARCH64 || ARCH_ARM
    loop_restoration_dsp_init_arm(c, bpc);
#elif ARCH_LOONGARCH64
    loop_restoration_dsp_init_loongarch(c, bpc);
#elif ARCH_PPC64LE
    loop_restoration_dsp_init_ppc(c, bpc);
#elif ARCH_X86
    loop_restoration_dsp_init_x86(c, bpc);
#endif
#endif
}

Messung V0.5

¤ Dauer der Verarbeitung: 0.21 Sekunden (vorverarbeitet) ¤

Wurzel

Suchen

Beweissystem der NASA

Beweissystem Isabelle

NIST Cobol Testsuite

Cephes Mathematical Library

Wiener Entwicklungsmethode

Haftungshinweis

Die Informationen auf dieser Webseite wurden nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit, noch Qualität der bereit gestellten Informationen zugesichert.

Bemerkung:

Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.