// 7-tap horizontal filter excerpt: left-edge setup, right-edge padding and
// the tail of the horizontal loop.
// NOTE(review): no function header is visible for this span, and local labels
// 2, 3 and 4 (plus the final forward 0) are referenced but not defined in it;
// the code that belongs between these pieces is presumably elided -- confirm
// against the complete file.

// Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
tst w7, #1 // LR_HAVE_LEFT
b.eq 1f
// LR_HAVE_LEFT
cbnz x2, 0f
// left == NULL: step x3 back 3 bytes so the 16-byte load below starts
// 3 bytes before the original pointer.
sub x3, x3, #3
ld1 {v3.16b}, [x3], #16
b 2f

0:
// LR_HAVE_LEFT, left != NULL
ld1 {v3.16b}, [x3], #16
ld1 {v2.s}[3], [x2], #4
// Move x3 back to account for the last 3 bytes we loaded earlier,
// which we'll shift out.
sub x3, x3, #3
// v3 = top 3 bytes of v2 (loaded from x2) followed by the first 13 bytes of v3.
ext v3.16b, v2.16b, v3.16b, #13
b 2f

1:
ld1 {v3.16b}, [x3], #16
// !LR_HAVE_LEFT, fill v2 with the leftmost byte
// and shift v3 to have 3x the first byte at the front.
dup v2.16b, v3.b[0]
// Move x3 back to account for the last 3 bytes we loaded before,
// which we shifted out.
sub x3, x3, #3
ext v3.16b, v2.16b, v3.16b, #13

// Check whether we need to pad the right edge
cmp w4, #19
b.ge 4f // If w >= 19, all used input pixels are valid

// 1 <= w < 19, w+3 pixels valid in v2-v4. For w>=9,
// this ends up called again; it's not strictly needed in those
// cases (we pad enough here), but keeping the code as simple as possible.

// The padding pixel is v2/3/4.h[w+2]. x3 points at the next input, ie
// v2/3/4.h[24]. Thus read from x3[w-22] to find the padding pixel.
sub w17, w4, #22
// Insert padding in v2/3/4.h[w+3] onwards; fuse the +3 (*2) into the
// buffer pointer.
movrel x6, right_ext_mask, -6
ldr b28, [x3, w17, sxtw]
sub x6, x6, w4, uxtw #1
// Broadcast the padding pixel to every 16-bit lane.
dup v28.8h, v28.h[0]
ld1 {v25.16b, v26.16b, v27.16b}, [x6]

// Where the mask bits are set, replace the lane contents with the padding pixel.
bit v2.16b, v28.16b, v25.16b
bit v3.16b, v28.16b, v26.16b
bit v4.16b, v28.16b, v27.16b

// NOTE(review): no flag-setting instruction is visible between the cmp above
// and this b.le; the loop body that updates the remaining width is presumably
// elided here -- confirm.
b.le 0f
mov v2.16b, v4.16b
ld1 {v4.16b}, [x3], #16
tst w7, #2 // LR_HAVE_RIGHT
// Widen the 16 freshly loaded bytes into v3/v4 (uxtl/uxtl2 do not touch flags).
uxtl v3.8h, v4.8b
uxtl2 v4.8h, v4.16b
b.ne 4b // If we don't need to pad, just keep filtering.
b 3b // If we need to pad, check how many pixels we have left.
// 7-tap combined horizontal/vertical filter, 8 bpc.
// Registers as used in the visible code: x2 = left-edge pointer (may be NULL),
// x3 = source pointer, w4 = width, w7 = edge flags (bit 0 LR_HAVE_LEFT,
// bit 1 LR_HAVE_RIGHT).
// NOTE(review): local labels 2, 3 and 4 (plus the final forward 0), the
// register restore and the function end are not visible in this span; they
// are presumably elided -- confirm against the complete file.
function wiener_filter7_hv_8bpc_neon
// Backing up/restoring registers shifted, so that x9 gets the value
// of x10, etc, and x15==x9, afterwards.
// 80-byte frame; x10 is stored twice (offsets 0 and 48).
stp x10, x11, [sp, #-80]!
stp x12, x13, [sp, #16]
stp x14, x15, [sp, #32]
stp x10, x0, [sp, #48]
stp x3, x4, [sp, #64]

// Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
tst w7, #1 // LR_HAVE_LEFT
b.eq 1f
// LR_HAVE_LEFT
cbnz x2, 0f
// left == NULL: step x3 back 3 bytes so the 16-byte load below starts
// 3 bytes before the original pointer.
sub x3, x3, #3
ld1 {v3.16b}, [x3], #16
b 2f

0:
// LR_HAVE_LEFT, left != NULL
ld1 {v3.16b}, [x3], #16
ld1 {v2.s}[3], [x2], #4
// Move x3 back to account for the last 3 bytes we loaded earlier,
// which we'll shift out.
sub x3, x3, #3
// v3 = top 3 bytes of v2 (loaded from x2) followed by the first 13 bytes of v3.
ext v3.16b, v2.16b, v3.16b, #13
b 2f

1:
ld1 {v3.16b}, [x3], #16
// !LR_HAVE_LEFT, fill v2 with the leftmost byte
// and shift v3 to have 3x the first byte at the front.
dup v2.16b, v3.b[0]
// Move x3 back to account for the last 3 bytes we loaded before,
// which we shifted out.
sub x3, x3, #3
ext v3.16b, v2.16b, v3.16b, #13

// Check whether we need to pad the right edge
cmp w4, #19
b.ge 4f // If w >= 19, all used input pixels are valid

// 1 <= w < 19, w+3 pixels valid in v2-v4. For w>=9,
// this ends up called again; it's not strictly needed in those
// cases (we pad enough here), but keeping the code as simple as possible.

// The padding pixel is v2/3/4.h[w+2]. x3 points at the next input, ie
// v2/3/4.h[24]. Thus read from x3[w-22] to find the padding pixel.
sub w17, w4, #22
// Insert padding in v2/3/4.h[w+3] onwards; fuse the +3 (*2) into the
// buffer pointer.
movrel x6, right_ext_mask, -6
ldr b28, [x3, w17, sxtw]
sub x6, x6, w4, uxtw #1
// Broadcast the padding pixel to every 16-bit lane.
dup v28.8h, v28.h[0]
ld1 {v25.16b, v26.16b, v27.16b}, [x6]

// Where the mask bits are set, replace the lane contents with the padding pixel.
bit v2.16b, v28.16b, v25.16b
bit v3.16b, v28.16b, v26.16b
bit v4.16b, v28.16b, v27.16b

// NOTE(review): no flag-setting instruction is visible between the cmp above
// and this b.le; the loop body that updates the remaining width is presumably
// elided here -- confirm.
b.le 0f
mov v2.16b, v4.16b
ld1 {v4.16b}, [x3], #16
tst w7, #2 // LR_HAVE_RIGHT
// Widen the 16 freshly loaded bytes into v3/v4 (uxtl/uxtl2 do not touch flags).
uxtl v3.8h, v4.8b
uxtl2 v4.8h, v4.16b
b.ne 4b // If we don't need to pad, just keep filtering.
b 3b // If we need to pad, check how many pixels we have left.
// 5-tap horizontal filter excerpt: left-edge setup, right-edge padding and
// the tail of the horizontal loop.
// NOTE(review): no function header is visible for this span, and local labels
// 2, 3 and 4 (plus the final forward 0) are referenced but not defined in it;
// the code that belongs between these pieces is presumably elided -- confirm
// against the complete file.

// Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
tst w7, #1 // LR_HAVE_LEFT
b.eq 1f
// LR_HAVE_LEFT
cbnz x2, 0f
// left == NULL: step x3 back 2 bytes so the 16-byte load below starts
// 2 bytes before the original pointer.
sub x3, x3, #2
ld1 {v3.16b}, [x3], #16
b 2f

0:
// LR_HAVE_LEFT, left != NULL
ld1 {v3.16b}, [x3], #16
ld1 {v2.s}[3], [x2], #4
// Move x3 back to account for the last 2 bytes we loaded earlier,
// which we'll shift out.
sub x3, x3, #2
// v3 = top 2 bytes of v2 (loaded from x2) followed by the first 14 bytes of v3.
ext v3.16b, v2.16b, v3.16b, #14
b 2f

1:
ld1 {v3.16b}, [x3], #16
// !LR_HAVE_LEFT, fill v2 with the leftmost byte
// and shift v3 to have 2x the first byte at the front.
dup v2.16b, v3.b[0]
// Move x3 back to account for the last 2 bytes we loaded before,
// which we shifted out.
sub x3, x3, #2
ext v3.16b, v2.16b, v3.16b, #14

// Check whether we need to pad the right edge
cmp w4, #18
b.ge 4f // If w >= 18, all used input pixels are valid

// 1 <= w < 18, w+2 pixels valid in v2-v4. For w>=9,
// this ends up called again; it's not strictly needed in those
// cases (we pad enough here), but keeping the code as simple as possible.

// The padding pixel is v2/3/4.h[w+1]. x3 points at the next input, ie
// v2/3/4.h[24]. Thus read from x3[w-23] to find the padding pixel.
sub w17, w4, #23
// Insert padding in v2/3/4.h[w+2] onwards; fuse the +2 (*2) into the
// buffer pointer.
movrel x6, right_ext_mask, -4
ldr b28, [x3, w17, sxtw]
sub x6, x6, w4, uxtw #1
// Broadcast the padding pixel to every 16-bit lane.
dup v28.8h, v28.h[0]
ld1 {v25.16b, v26.16b, v27.16b}, [x6]

// Where the mask bits are set, replace the lane contents with the padding pixel.
bit v2.16b, v28.16b, v25.16b
bit v3.16b, v28.16b, v26.16b
bit v4.16b, v28.16b, v27.16b

// NOTE(review): no flag-setting instruction is visible between the cmp above
// and this b.le; the loop body that updates the remaining width is presumably
// elided here -- confirm.
b.le 0f
mov v2.16b, v4.16b
ld1 {v4.16b}, [x3], #16
tst w7, #2 // LR_HAVE_RIGHT
// Widen the 16 freshly loaded bytes into v3/v4 (uxtl/uxtl2 do not touch flags).
uxtl v3.8h, v4.8b
uxtl2 v4.8h, v4.16b
b.ne 4b // If we don't need to pad, just keep filtering.
b 3b // If we need to pad, check how many pixels we have left.
// 5-tap combined horizontal/vertical filter, 8 bpc.
// Registers as used in the visible code: x2 = left-edge pointer (may be NULL),
// x3 = source pointer, w4 = width, w7 = edge flags (bit 0 LR_HAVE_LEFT,
// bit 1 LR_HAVE_RIGHT).
// NOTE(review): local labels 2, 3 and 4 (plus the final forward 0), the
// register restore and the function end are not visible in this span; they
// are presumably elided -- confirm against the complete file.
function wiener_filter5_hv_8bpc_neon
// Backing up/restoring registers shifted, so that x11 gets the value
// of x12, etc, and x15==x11, afterwards.
// 64-byte frame; x12 is stored twice (offsets 0 and 32).
stp x12, x13, [sp, #-64]!
stp x14, x15, [sp, #16]
stp x12, x0, [sp, #32]
stp x3, x4, [sp, #48]

// Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
tst w7, #1 // LR_HAVE_LEFT
b.eq 1f
// LR_HAVE_LEFT
cbnz x2, 0f
// left == NULL: step x3 back 2 bytes so the 16-byte load below starts
// 2 bytes before the original pointer.
sub x3, x3, #2
ld1 {v3.16b}, [x3], #16
b 2f

0:
// LR_HAVE_LEFT, left != NULL
ld1 {v3.16b}, [x3], #16
ld1 {v2.s}[3], [x2], #4
// Move x3 back to account for the last 2 bytes we loaded earlier,
// which we'll shift out.
sub x3, x3, #2
// v3 = top 2 bytes of v2 (loaded from x2) followed by the first 14 bytes of v3.
ext v3.16b, v2.16b, v3.16b, #14
b 2f

1:
ld1 {v3.16b}, [x3], #16
// !LR_HAVE_LEFT, fill v2 with the leftmost byte
// and shift v3 to have 2x the first byte at the front.
dup v2.16b, v3.b[0]
// Move x3 back to account for the last 2 bytes we loaded before,
// which we shifted out.
sub x3, x3, #2
ext v3.16b, v2.16b, v3.16b, #14

// Check whether we need to pad the right edge
cmp w4, #18
b.ge 4f // If w >= 18, all used input pixels are valid

// 1 <= w < 18, w+2 pixels valid in v2-v4. For w>=9,
// this ends up called again; it's not strictly needed in those
// cases (we pad enough here), but keeping the code as simple as possible.

// The padding pixel is v2/3/4.h[w+1]. x3 points at the next input, ie
// v2/3/4.h[24]. Thus read from x3[w-23] to find the padding pixel.
sub w17, w4, #23
// Insert padding in v2/3/4.h[w+2] onwards; fuse the +2 (*2) into the
// buffer pointer.
movrel x6, right_ext_mask, -4
ldr b28, [x3, w17, sxtw]
sub x6, x6, w4, uxtw #1
// Broadcast the padding pixel to every 16-bit lane.
dup v28.8h, v28.h[0]
ld1 {v25.16b, v26.16b, v27.16b}, [x6]

// Where the mask bits are set, replace the lane contents with the padding pixel.
bit v2.16b, v28.16b, v25.16b
bit v3.16b, v28.16b, v26.16b
bit v4.16b, v28.16b, v27.16b

// NOTE(review): no flag-setting instruction is visible between the cmp above
// and this b.le; the loop body that updates the remaining width is presumably
// elided here -- confirm.
b.le 0f
mov v2.16b, v4.16b
ld1 {v4.16b}, [x3], #16
tst w7, #2 // LR_HAVE_RIGHT
// Widen the 16 freshly loaded bytes into v3/v4 (uxtl/uxtl2 do not touch flags).
uxtl v3.8h, v4.8b
uxtl2 v4.8h, v4.16b
b.ne 4b // If we don't need to pad, just keep filtering.
b 3b // If we need to pad, check how many pixels we have left.
// Row-loading excerpt with a 2-pixel left edge: left-edge setup and the
// start of the right-edge padding. Edge flags are tested in w5 here.
// NOTE(review): no function header is visible for this span, the branches
// that reach labels 0 and 1 are not shown, and labels 2 and 4 are referenced
// but not defined in it; surrounding code is presumably elided -- confirm
// against the complete file.

// LR_HAVE_LEFT && left == NULL
// Step x3 back 2 bytes so the 16-byte load below starts 2 bytes before the
// original pointer.
sub x3, x3, #2
ld1 {v0.16b}, [x3], #16
b 2f

0:
// LR_HAVE_LEFT, left != NULL
ld1 {v0.16b}, [x3], #16
// Note: x2 is not post-incremented by this load.
ld1 {v1.s}[3], [x2]
// Move x3 back to account for the last 2 bytes we loaded earlier,
// which we'll shift out.
sub x3, x3, #2
// v0 = top 2 bytes of v1 (loaded from x2) followed by the first 14 bytes of v0.
ext v0.16b, v1.16b, v0.16b, #14
b 2f

1:
ld1 {v0.16b}, [x3], #16
// !LR_HAVE_LEFT, fill v1 with the leftmost byte
// and shift v0 to have 2x the first byte at the front.
dup v1.16b, v0.b[0]
// Move x3 back to account for the last 2 bytes we loaded before,
// which we shifted out.
sub x3, x3, #2
ext v0.16b, v1.16b, v0.16b, #14

tst w5, #2 // LR_HAVE_RIGHT
b.ne 4f
// If we'll need to pad the right edge, load that byte to pad with
// here since we can find it pretty easily from here.
sub w13, w4, #(2 + 16 - 2 + 1)
ldr b30, [x3, w13, sxtw]
// Fill v30 with the right padding pixel
dup v30.16b, v30.b[0]

3: // !LR_HAVE_RIGHT
// Check whether we need to pad the right edge
cmp w4, #10
b.ge 4f // If w >= 10, all used input pixels are valid

// 1 <= w < 10, w pixels valid in v0. For w=9, this ends up called
// again; it's not strictly needed in those cases (we pad enough here),
// but keeping the code as simple as possible.

// Insert padding in v0.b[w] onwards
movrel x13, right_ext_mask
sub x13, x13, w4, uxtw
ld1 {v29.16b}, [x13]
// Row-loading excerpt with a 3-pixel left edge: left-edge setup and the
// start of the right-edge padding. Edge flags are tested in w5 here.
// NOTE(review): no function header is visible for this span, the branches
// that reach labels 0 and 1 are not shown, and labels 2 and 4 are referenced
// but not defined in it; surrounding code is presumably elided -- confirm
// against the complete file.

// LR_HAVE_LEFT && left == NULL
// Step x3 back 3 bytes so the 16-byte load below starts 3 bytes before the
// original pointer.
sub x3, x3, #3
ld1 {v0.16b}, [x3], #16
b 2f

0:
// LR_HAVE_LEFT, left != NULL
ld1 {v0.16b}, [x3], #16
ld1 {v1.s}[3], [x2], #4
// Move x3 back to account for the last 3 bytes we loaded earlier,
// which we'll shift out.
sub x3, x3, #3
// v0 = top 3 bytes of v1 (loaded from x2) followed by the first 13 bytes of v0.
ext v0.16b, v1.16b, v0.16b, #13
b 2f

1:
ld1 {v0.16b}, [x3], #16
// !LR_HAVE_LEFT, fill v1 with the leftmost byte
// and shift v0 to have 3x the first byte at the front.
dup v1.16b, v0.b[0]
// Move x3 back to account for the last 3 bytes we loaded before,
// which we shifted out.
sub x3, x3, #3
ext v0.16b, v1.16b, v0.16b, #13

tst w5, #2 // LR_HAVE_RIGHT
b.ne 4f
// If we'll need to pad the right edge, load that byte to pad with
// here since we can find it pretty easily from here.
sub w13, w4, #(2 + 16 - 3 + 1)
ldr b30, [x3, w13, sxtw]
// Fill v30 with the right padding pixel
dup v30.16b, v30.b[0]

3: // !LR_HAVE_RIGHT
// Check whether we need to pad the right edge
cmp w4, #11
b.ge 4f // If w >= 11, all used input pixels are valid

// 1 <= w < 11, w+1 pixels valid in v0. For w=9 or w=10,
// this ends up called again; it's not strictly needed in those
// cases (we pad enough here), but keeping the code as simple as possible.

// Insert padding in v0.b[w+1] onwards; fuse the +1 into the
// buffer pointer.
movrel x13, right_ext_mask, -1
sub x13, x13, w4, uxtw
ld1 {v29.16b}, [x13]
// Row-loading excerpt with a 3-pixel left edge. In this variant the source
// pointer is x5, the left-edge pointer is x4, the width is w6 and the edge
// flags are tested in w7.
// NOTE(review): no function header is visible for this span, the branches
// that reach labels 0 and 1 are not shown, and labels 2 and 4 are referenced
// but not defined in it; surrounding code is presumably elided -- confirm
// against the complete file.

// LR_HAVE_LEFT && left == NULL
// Step x5 back 3 bytes so the 16-byte load below starts 3 bytes before the
// original pointer.
sub x5, x5, #3
ld1 {v0.16b}, [x5], #16
b 2f

0:
// LR_HAVE_LEFT, left != NULL
ld1 {v0.16b}, [x5], #16
ld1 {v1.s}[3], [x4], #4
// Move x5 back to account for the last 3 bytes we loaded earlier,
// which we'll shift out.
sub x5, x5, #3
// v0 = top 3 bytes of v1 (loaded from x4) followed by the first 13 bytes of v0.
ext v0.16b, v1.16b, v0.16b, #13
b 2f

1:
ld1 {v0.16b}, [x5], #16
// !LR_HAVE_LEFT, fill v1 with the leftmost byte
// and shift v0 to have 3x the first byte at the front.
dup v1.16b, v0.b[0]
// Move x5 back to account for the last 3 bytes we loaded before,
// which we shifted out.
sub x5, x5, #3
ext v0.16b, v1.16b, v0.16b, #13

tst w7, #2 // LR_HAVE_RIGHT
b.ne 4f
// If we'll need to pad the right edge, load that byte to pad with
// here since we can find it pretty easily from here.
sub w13, w6, #(2 + 16 - 3 + 1)
ldr b30, [x5, w13, sxtw]
// Fill v30 with the right padding pixel
dup v30.16b, v30.b[0]

3: // !LR_HAVE_RIGHT
// Check whether we need to pad the right edge
cmp w6, #11
b.ge 4f // If w >= 11, all used input pixels are valid

// 1 <= w < 11, w+1 pixels valid in v0. For w=9 or w=10,
// this ends up called again; it's not strictly needed in those
// cases (we pad enough here), but keeping the code as simple as possible.

// Insert padding in v0.b[w+1] onwards; fuse the +1 into the
// buffer pointer.
movrel x13, right_ext_mask, -1
sub x13, x13, w6, uxtw
ld1 {v29.16b}, [x13]
// Die Informationen auf dieser Webseite wurden nach bestem Wissen sorgfältig
// zusammengestellt. Es wird jedoch weder Vollständigkeit noch Richtigkeit
// noch Qualität der bereitgestellten Informationen zugesichert.
// Bemerkung:
// Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.