/*
 * Copyright (c) 2017 Google Inc.
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
// The input to and output from this macro is in the registers v16-v31,
// and v0-v7 are used as scratch registers.
// p7 = v16 .. p3 = v20, p0 = v23, q0 = v24, q3 = v27, q7 = v31
// Depending on the width of the loop filter, we either use v16-v19
// and v28-v31 as temp registers, or v8-v15.
// Core loop filter, working on 8 lanes of 16 bits (.8h) per register.
//   wd         - filter width: 4, 8 or 16; selects the .if-guarded parts
//   tmp1-tmp8  - vector registers the body may use as temporaries
// w2, w3 and w4 are broadcast as the E, I and H thresholds.
// For wd >= 8 the body clobbers x11, x12 and the condition flags, and it
// can leave early through "ret x13" (wd == 8) or "ret x14" / "ret x15"
// (wd == 16) instead of falling through to the end of the macro.
// NOTE(review): the body below is not complete as it stands: there is an
// .else and two .endif directives without a matching .if, the last
// ".if \wd == 16" is never closed and the closing .endm is missing.
// Restore those parts from the upstream file; nothing has been invented
// here to fill the gaps.
.macro loop_filter wd, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8
        dup             v0.8h,  w2                      // E
        dup             v2.8h,  w3                      // I
        dup             v3.8h,  w4                      // H

        umax            v7.8h,  v7.8h,  v2.8h
        umax            v1.8h,  v1.8h,  v8.8h
        umax            v9.8h,  v9.8h,  v10.8h
        umax            v11.8h, v11.8h, v12.8h
        // The rest of the calculation of flat8out is interleaved below
.else
        // The rest of the calculation of flat8in is interleaved below
.endif
.endif

        // Every instruction gets its own line: anything that follows a
        // "//" comment on the same line is not assembled.
        add             v0.8h,  v23.8h, \tmp4\().8h     // p0 + f2
        sub             v2.8h,  v24.8h, \tmp3\().8h     // q0 - f1
        // Clamp both results to [\tmp5, \tmp6]; the rounding shift that
        // produces f is interleaved between the two halves of the clamp.
        smin            v0.8h,  v0.8h,  \tmp6\().8h
        smin            v2.8h,  v2.8h,  \tmp6\().8h
        srshr           \tmp3\().8h, \tmp3\().8h, #1    // f = (f1 + 1) >> 1
        smax            v0.8h,  v0.8h,  \tmp5\().8h     // out p0
        smax            v2.8h,  v2.8h,  \tmp5\().8h     // out q0
        bit             v23.16b, v0.16b, v4.16b         // if (fm && !flat8in)
        bit             v24.16b, v2.16b, v4.16b

        add             v0.8h,  v22.8h, \tmp3\().8h     // p1 + f
        sub             v2.8h,  v25.8h, \tmp3\().8h     // q1 - f
.if \wd >= 8
        // Start moving the v6 mask to general registers; the test of it
        // is interleaved with the remaining filter arithmetic.
        mov             x11, v6.d[0]
.endif
        smin            v0.8h,  v0.8h,  \tmp6\().8h
        smin            v2.8h,  v2.8h,  \tmp6\().8h
.if \wd >= 8
        mov             x12, v6.d[1]
.endif
        smax            v0.8h,  v0.8h,  \tmp5\().8h     // out p1
        smax            v2.8h,  v2.8h,  \tmp5\().8h     // out q1
.if \wd >= 8
        // Sets the flags that the b.eq / b.ne below branch on.
        adds            x11, x11, x12
.endif
        bit             v22.16b, v0.16b, v5.16b         // if (!hev && fm && !flat8in)
        bit             v25.16b, v2.16b, v5.16b

        // If no pixels need flat8in, jump to flat8out
        // (or to a writeout of the inner 4 pixels, for wd=8)
        // The flags tested here still come from the adds above; the bit
        // instructions in between do not modify them.
.if \wd >= 8
.if \wd == 16
        b.eq            6f
.else
        b.ne            1f
        ret             x13
1:
.endif

        add             v0.8h,  v0.8h,  \tmp7\().8h
        sub             \tmp3\().8h, \tmp3\().8h, \tmp1\().8h
        urshr           \tmp5\().8h, v0.8h,  #3         // out q1

        add             v0.8h,  v0.8h,  \tmp3\().8h
        // The output here is written back into the input registers. This doesn't
        // matter for the flat8part below, since we only update those pixels
        // which won't be touched below.
        bit             v21.16b, v2.16b, v6.16b
        bit             v22.16b, v3.16b, v6.16b
        bit             v23.16b, v4.16b, v6.16b
        urshr           \tmp6\().8h, v0.8h,  #3         // out q2
        bit             v24.16b, v5.16b, v6.16b
        bit             v25.16b, \tmp5\().16b, v6.16b
        bit             v26.16b, \tmp6\().16b, v6.16b
.endif
.if \wd == 16
6:
        // Combine the v6 and v7 masks and test whether any lane is set.
        orr             v2.16b, v6.16b, v7.16b
        mov             x11, v2.d[0]
        mov             x12, v2.d[1]
        adds            x11, x11, x12
        b.ne            1f
        // If no pixels needed flat8in nor flat8out, jump to a
        // writeout of the inner 4 pixels
        ret             x14
1:

        mov             x11, v7.d[0]
        mov             x12, v7.d[1]
        adds            x11, x11, x12
        b.ne            1f
        // If no pixels need flat8out, jump to a writeout of the inner 6 pixels
        ret             x15

1:
        // flat8out
        // This writes all outputs into v2-v17 (skipping v6 and v16).
        // If this part is skipped, the output is read from v21-v26 (which is the input
        // to this section).
        shl             v0.8h,  v16.8h, #3              // 8 * v16
        sub             v0.8h,  v0.8h,  v16.8h          // 7 * v16
        add             v0.8h,  v0.8h,  v17.8h
        add             v8.8h,  v17.8h, v18.8h
        add             v10.8h, v19.8h, v20.8h
        add             v0.8h,  v0.8h,  v8.8h
        add             v8.8h,  v16.8h, v17.8h
        add             v12.8h, v21.8h, v22.8h
        add             v0.8h,  v0.8h,  v10.8h
        add             v10.8h, v18.8h, v25.8h
        add             v14.8h, v23.8h, v24.8h
        sub             v10.8h, v10.8h, v8.8h
        add             v0.8h,  v0.8h,  v12.8h
        add             v0.8h,  v0.8h,  v14.8h
        add             v12.8h, v16.8h, v18.8h
        add             v14.8h, v19.8h, v26.8h
        urshr           v2.8h,  v0.8h,  #4
// For wd <= 8, we use v16-v19 and v28-v31 for temp registers,
// while we need those for inputs/outputs in wd=16 and use v8-v15
// for temp registers there instead.
// Instantiates the loop_filter macro with wd = 4, handing it v16-v19 and
// v28-v31 as temporaries, and returns to the caller with a plain ret
// (through x30) once the macro body has run.
// Pixel data is passed in and returned in vector registers as described
// at the loop_filter macro; w2, w3 and w4 supply the E, I and H values.
function vp9_loop_filter_4
loop_filter 4, v16, v17, v18, v19, v28, v29, v30, v31
ret
endfunc
// Instantiates the loop_filter macro with wd = 8, handing it v16-v19 and
// v28-v31 as temporaries.
// Two exits exist: the plain ret below (through x30), and a "ret x13"
// inside the macro that is taken when no pixels need flat8in. The caller
// therefore has to load x13 beforehand; per the macro's comments that
// target is a writeout of the inner 4 pixels.
// The wd >= 8 parts of the macro also overwrite x11, x12 and the flags.
function vp9_loop_filter_8
loop_filter 8, v16, v17, v18, v19, v28, v29, v30, v31
ret
endfunc
// Instantiates the loop_filter macro with wd = 16. Here v16-v31 all carry
// inputs/outputs, so v8-v15 are handed over as temporaries instead.
// Three exits exist: the plain ret below (through x30), "ret x14" inside
// the macro when neither flat8in nor flat8out is needed (writeout of the
// inner 4 pixels), and "ret x15" when flat8out is not needed (writeout of
// the inner 6 pixels). The caller has to load x14 and x15 beforehand.
// The wd >= 8 parts of the macro also overwrite x11, x12 and the flags.
// NOTE(review): the low halves of v8-v15 are callee-saved under AAPCS64;
// presumably the exported entry points save and restore them - confirm.
function vp9_loop_filter_16
loop_filter 16, v8, v9, v10, v11, v12, v13, v14, v15
ret
endfunc
// The public functions in this file have got the following signature:
// void loop_filter(uint8_t *dst, ptrdiff_t stride, int mb_lim, int lim, int hev_thr);
// Move x9 forward by 2 pixels; we don't need to rewrite the
// outermost 2 pixels since they aren't changed.
add x9, x9, #4
// x0 addresses the same columns 4 * x1 bytes further on
// (x1 is presumably the row stride - confirm against the caller).
add x0, x9, x1, lsl #2
// We will only write the mid 4 pixels back; after the loop filter,
// these are in v22, v23, v24, v25, ordered as rows (8x4 pixels).
// We need to transpose them to columns, done with a 4x8 transpose
// (which in practice is two 4x4 transposes of the two 4x4 halves
// of the 8x4 pixels; into 4x8 pixels).
transpose_4x8H v22, v23, v24, v25, v26, v27, v28, v29
// The low 64 bits of each register are stored through x9 and the high
// 64 bits through x0; both pointers are post-incremented by x1.
st1 {v22.d}[0], [x9], x1
st1 {v22.d}[1], [x0], x1
st1 {v23.d}[0], [x9], x1
st1 {v23.d}[1], [x0], x1
st1 {v24.d}[0], [x9], x1
st1 {v24.d}[1], [x0], x1
st1 {v25.d}[0], [x9], x1
st1 {v25.d}[1], [x0], x1
// x0 now sits 8 * x1 past the adjusted x9 start value: step it back by
// that amount, then forward by another 4 bytes. The sub needs its own
// line; appended to the st1 above it is not a valid statement.
sub x0, x0, x1, lsl #3
add x0, x0, #4
// Even though only 6 pixels per row have been changed, we write the
// full 8 pixel registers.
transpose_8x8H v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
// The 16x8 pixels read above are in two 8x8 blocks; the left
// half in v16-v23, and the right half in v24-v31. Do two 8x8 transposes
// of this, to get one column per register.
transpose_8x8H v16, v17, v18, v19, v20, v21, v22, v23, v0, v1
transpose_8x8H v24, v25, v26, v27, v28, v29, v30, v31, v0, v1
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.