// Horizontal edge handling for the 7-tap Wiener filter, 16 bpc.
// NOTE(review): several instructions had been fused onto the end of the
// preceding comment lines (making them comments and losing the code);
// they are restored onto their own lines below.
// Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
tst w7, #1 // LR_HAVE_LEFT
b.eq 1f
// LR_HAVE_LEFT
cbnz x2, 0f
// left == NULL
sub x3, x3, #6
ld1 {v2.8h, v3.8h}, [x3], #32
b 2f
0:
// LR_HAVE_LEFT, left != NULL
ld1 {v2.8h, v3.8h}, [x3], #32
ld1 {v4.d}[1], [x2], #8
// Move x3 back to account for the last 3 pixels we loaded earlier,
// which we'll shift out.
sub x3, x3, #6
ext v3.16b, v2.16b, v3.16b, #10
ext v2.16b, v4.16b, v2.16b, #10
b 2f
1:
ld1 {v2.8h, v3.8h}, [x3], #32
// !LR_HAVE_LEFT, fill v4 with the leftmost pixel
// and shift v3 to have 3x the first pixel at the front.
dup v4.8h, v2.h[0]
// Move x3 back to account for the last 3 pixels we loaded before,
// which we shifted out.
sub x3, x3, #6
ext v3.16b, v2.16b, v3.16b, #10
ext v2.16b, v4.16b, v2.16b, #10
2:
ld1 {v4.8h}, [x3], #16
tst w7, #2 // LR_HAVE_RIGHT
b.ne 4f
3: // !LR_HAVE_RIGHT
// Check whether we need to pad the right edge
cmp w4, #19
b.ge 4f // If w >= 19, all used input pixels are valid
// 1 <= w < 19, w+3 pixels valid in v2-v4. For w>=9,
// this ends up called again; it's not strictly needed in those
// cases (we pad enough here), but keeping the code as simple as possible.
// The padding pixel is v2/3/4.h[w+2]. x3 points at the next input, ie
// v2/3/4.h[24]. Thus read from x3[w-22] to find the padding pixel.
sub w17, w4, #22
// Insert padding in v2/3/4.h[w+3] onwards; fuse the +3 (*2) into the
// buffer pointer.
movrel x6, right_ext_mask, -6
ldr h26, [x3, w17, sxtw #1]
sub x6, x6, w4, uxtw #1
dup v26.8h, v26.h[0]
ld1 {v23.16b, v24.16b, v25.16b}, [x6]
// Blend the padding pixel over the invalid lanes selected by the mask.
bit v2.16b, v26.16b, v23.16b
bit v3.16b, v26.16b, v24.16b
bit v4.16b, v26.16b, v25.16b
b.le 0f
mov v2.16b, v4.16b
tst w7, #2 // LR_HAVE_RIGHT
ld1 {v3.8h, v4.8h}, [x3], #32
b.ne 4b // If we don't need to pad, just keep filtering.
b 3b // If we need to pad, check how many pixels we have left.
// Combined horizontal+vertical pass of the 7-tap Wiener filter, 16 bpc.
// NOTE(review): instructions fused onto the end of comment lines are
// restored onto their own lines below; function continues past this view.
function wiener_filter7_hv_16bpc_neon
// Backing up/restoring registers shifted, so that x9 gets the value
// of x10, etc, and x15==x9, afterwards.
stp x10, x11, [sp, #-80]!
stp x12, x13, [sp, #16]
stp x14, x15, [sp, #32]
stp x10, x0, [sp, #48]
stp x3, x4, [sp, #64]
// Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
tst w7, #1 // LR_HAVE_LEFT
b.eq 1f
// LR_HAVE_LEFT
cbnz x2, 0f
// left == NULL
sub x3, x3, #6
ld1 {v2.8h, v3.8h}, [x3], #32
b 2f
0:
// LR_HAVE_LEFT, left != NULL
ld1 {v2.8h, v3.8h}, [x3], #32
ld1 {v4.d}[1], [x2], #8
// Move x3 back to account for the last 3 pixels we loaded earlier,
// which we'll shift out.
sub x3, x3, #6
ext v3.16b, v2.16b, v3.16b, #10
ext v2.16b, v4.16b, v2.16b, #10
b 2f
1:
ld1 {v2.8h, v3.8h}, [x3], #32
// !LR_HAVE_LEFT, fill v4 with the leftmost pixel
// and shift v3 to have 3x the first pixel at the front.
dup v4.8h, v2.h[0]
// Move x3 back to account for the last 3 pixels we loaded before,
// which we shifted out.
sub x3, x3, #6
ext v3.16b, v2.16b, v3.16b, #10
ext v2.16b, v4.16b, v2.16b, #10
2:
ld1 {v4.8h}, [x3], #16
tst w7, #2 // LR_HAVE_RIGHT
b.ne 4f
3: // !LR_HAVE_RIGHT
// Check whether we need to pad the right edge
cmp w4, #19
b.ge 4f // If w >= 19, all used input pixels are valid
// 1 <= w < 19, w+3 pixels valid in v2-v4. For w>=9,
// this ends up called again; it's not strictly needed in those
// cases (we pad enough here), but keeping the code as simple as possible.
// The padding pixel is v2/3/4.h[w+2]. x3 points at the next input, ie
// v2/3/4.h[24]. Thus read from x3[w-22] to find the padding pixel.
sub w17, w4, #22
// Insert padding in v2/3/4.h[w+3] onwards; fuse the +3 (*2) into the
// buffer pointer.
movrel x6, right_ext_mask, -6
ldr h26, [x3, w17, sxtw #1]
sub x6, x6, w4, uxtw #1
dup v26.8h, v26.h[0]
ld1 {v23.16b, v24.16b, v25.16b}, [x6]
// Blend the padding pixel over the invalid lanes selected by the mask.
bit v2.16b, v26.16b, v23.16b
bit v3.16b, v26.16b, v24.16b
bit v4.16b, v26.16b, v25.16b
b.le 0f
mov v2.16b, v4.16b
tst w7, #2 // LR_HAVE_RIGHT
ld1 {v3.8h, v4.8h}, [x3], #32
b.ne 4b // If we don't need to pad, just keep filtering.
b 3b // If we need to pad, check how many pixels we have left.
// Horizontal edge handling for the 5-tap Wiener filter, 16 bpc (2-pixel edge).
// NOTE(review): instructions fused onto the end of comment lines are
// restored onto their own lines below.
// Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
tst w7, #1 // LR_HAVE_LEFT
b.eq 1f
// LR_HAVE_LEFT
cbnz x2, 0f
// left == NULL
sub x3, x3, #4
ld1 {v2.8h, v3.8h}, [x3], #32
b 2f
0:
// LR_HAVE_LEFT, left != NULL
ld1 {v2.8h, v3.8h}, [x3], #32
ld1 {v4.d}[1], [x2], #8
// Move x3 back to account for the last 2 pixels we loaded earlier,
// which we'll shift out.
sub x3, x3, #4
ext v3.16b, v2.16b, v3.16b, #12
ext v2.16b, v4.16b, v2.16b, #12
b 2f
1:
ld1 {v2.8h, v3.8h}, [x3], #32
// !LR_HAVE_LEFT, fill v4 with the leftmost pixel
// and shift v3 to have 2x the first pixel at the front.
dup v4.8h, v2.h[0]
// Move x3 back to account for the last 2 pixels we loaded before,
// which we shifted out.
sub x3, x3, #4
ext v3.16b, v2.16b, v3.16b, #12
ext v2.16b, v4.16b, v2.16b, #12
2:
ld1 {v4.8h}, [x3], #16
tst w7, #2 // LR_HAVE_RIGHT
b.ne 4f
3: // !LR_HAVE_RIGHT
// Check whether we need to pad the right edge
cmp w4, #18
b.ge 4f // If w >= 18, all used input pixels are valid
// 1 <= w < 18, w+2 pixels valid in v2-v4. For w>=9,
// this ends up called again; it's not strictly needed in those
// cases (we pad enough here), but keeping the code as simple as possible.
// The padding pixel is v2/3/4.h[w+2]. x3 points at the next input, ie
// v2/3/4.h[24]. Thus read from x3[w-23] to find the padding pixel.
sub w17, w4, #23
// Insert padding in v2/3/4.h[w+2] onwards; fuse the +2 (*2) into the
// buffer pointer.
movrel x6, right_ext_mask, -4
ldr h26, [x3, w17, sxtw #1]
sub x6, x6, w4, uxtw #1
dup v26.8h, v26.h[0]
ld1 {v23.16b, v24.16b, v25.16b}, [x6]
// Blend the padding pixel over the invalid lanes selected by the mask.
bit v2.16b, v26.16b, v23.16b
bit v3.16b, v26.16b, v24.16b
bit v4.16b, v26.16b, v25.16b
b.le 0f
mov v2.16b, v4.16b
tst w7, #2 // LR_HAVE_RIGHT
ld1 {v3.8h, v4.8h}, [x3], #32
b.ne 4b // If we don't need to pad, just keep filtering.
b 3b // If we need to pad, check how many pixels we have left.
// Combined horizontal+vertical pass of the 5-tap Wiener filter, 16 bpc.
// NOTE(review): instructions fused onto the end of comment lines are
// restored onto their own lines below; function continues past this view.
function wiener_filter5_hv_16bpc_neon
// Backing up/restoring registers shifted, so that x11 gets the value
// of x12, etc, and x15==x11, afterwards.
stp x12, x13, [sp, #-64]!
stp x14, x15, [sp, #16]
stp x12, x0, [sp, #32]
stp x3, x4, [sp, #48]
// Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
tst w7, #1 // LR_HAVE_LEFT
b.eq 1f
// LR_HAVE_LEFT
cbnz x2, 0f
// left == NULL
sub x3, x3, #4
ld1 {v2.8h, v3.8h}, [x3], #32
b 2f
0:
// LR_HAVE_LEFT, left != NULL
ld1 {v2.8h, v3.8h}, [x3], #32
ld1 {v4.d}[1], [x2], #8
// Move x3 back to account for the last 2 pixels we loaded earlier,
// which we'll shift out.
sub x3, x3, #4
ext v3.16b, v2.16b, v3.16b, #12
ext v2.16b, v4.16b, v2.16b, #12
b 2f
1:
ld1 {v2.8h, v3.8h}, [x3], #32
// !LR_HAVE_LEFT, fill v4 with the leftmost pixel
// and shift v3 to have 2x the first pixel at the front.
dup v4.8h, v2.h[0]
// Move x3 back to account for the last 2 pixels we loaded before,
// which we shifted out.
sub x3, x3, #4
ext v3.16b, v2.16b, v3.16b, #12
ext v2.16b, v4.16b, v2.16b, #12
2:
ld1 {v4.8h}, [x3], #16
tst w7, #2 // LR_HAVE_RIGHT
b.ne 4f
3: // !LR_HAVE_RIGHT
// Check whether we need to pad the right edge
cmp w4, #18
b.ge 4f // If w >= 18, all used input pixels are valid
// 1 <= w < 18, w+2 pixels valid in v2-v4. For w>=9,
// this ends up called again; it's not strictly needed in those
// cases (we pad enough here), but keeping the code as simple as possible.
// The padding pixel is v2/3/4.h[w+1]. x3 points at the next input, ie
// v2/3/4.h[24]. Thus read from x3[w-23] to find the padding pixel.
sub w17, w4, #23
// Insert padding in v2/3/4.h[w+2] onwards; fuse the +2 (*2) into the
// buffer pointer.
movrel x6, right_ext_mask, -4
ldr h26, [x3, w17, sxtw #1]
sub x6, x6, w4, uxtw #1
dup v26.8h, v26.h[0]
ld1 {v23.16b, v24.16b, v25.16b}, [x6]
// Blend the padding pixel over the invalid lanes selected by the mask.
bit v2.16b, v26.16b, v23.16b
bit v3.16b, v26.16b, v24.16b
bit v4.16b, v26.16b, v25.16b
b.le 0f
mov v2.16b, v4.16b
tst w7, #2 // LR_HAVE_RIGHT
ld1 {v3.8h, v4.8h}, [x3], #32
b.ne 4b // If we don't need to pad, just keep filtering.
b 3b // If we need to pad, check how many pixels we have left.
// SGR-style horizontal edge handling, 2-pixel edge, 16 bpc.
// NOTE(review): instructions fused onto the end of comment lines are
// restored onto their own lines below.
// LR_HAVE_LEFT && left == NULL
sub x3, x3, #4
ld1 {v0.8h, v1.8h}, [x3], #32
b 2f
0:
// LR_HAVE_LEFT, left != NULL
ld1 {v0.8h, v1.8h}, [x3], #32
ld1 {v2.d}[1], [x2]
// Move x3 back to account for the last 2 pixels we loaded earlier,
// which we'll shift out.
sub x3, x3, #4
ext v1.16b, v0.16b, v1.16b, #12
ext v0.16b, v2.16b, v0.16b, #12
b 2f
1:
ld1 {v0.8h, v1.8h}, [x3], #32
// !LR_HAVE_LEFT, fill v2 with the leftmost pixel
// and shift v0/v1 to have 2x the first pixel at the front.
dup v2.8h, v0.h[0]
// Move x3 back to account for the last 2 pixels we loaded before,
// which we shifted out.
sub x3, x3, #4
ext v1.16b, v0.16b, v1.16b, #12
ext v0.16b, v2.16b, v0.16b, #12
2:
tst w5, #2 // LR_HAVE_RIGHT
b.ne 4f
// If we'll need to pad the right edge, load that pixel to pad with
// here since we can find it pretty easily from here.
sub w13, w4, #(2 + 16 - 2 + 1)
ldr h30, [x3, w13, sxtw #1]
// Fill v30 with the right padding pixel
dup v30.8h, v30.h[0]
3: // !LR_HAVE_RIGHT
// Check whether we need to pad the right edge
cmp w4, #10
b.ge 4f // If w >= 10, all used input pixels are valid
// 1 <= w < 10, w pixels valid in v0. For w=9, this ends up called
// again; it's not strictly needed in those cases (we pad enough here),
// but keeping the code as simple as possible.
// Insert padding in v0.h[w] onwards
movrel x13, right_ext_mask
sub x13, x13, w4, uxtw #1
ld1 {v28.16b, v29.16b}, [x13]
// Blend the padding pixel over the invalid lanes selected by the mask.
bit v0.16b, v30.16b, v28.16b
bit v1.16b, v30.16b, v29.16b
// SGR-style horizontal edge handling, 3-pixel edge, 16 bpc.
// NOTE(review): instructions fused onto the end of comment lines are
// restored onto their own lines below.
// LR_HAVE_LEFT && left == NULL
sub x3, x3, #6
ld1 {v0.8h, v1.8h}, [x3], #32
b 2f
0:
// LR_HAVE_LEFT, left != NULL
ld1 {v0.8h, v1.8h}, [x3], #32
ld1 {v2.d}[1], [x2], #8
// Move x3 back to account for the last 3 pixels we loaded earlier,
// which we'll shift out.
sub x3, x3, #6
ext v1.16b, v0.16b, v1.16b, #10
ext v0.16b, v2.16b, v0.16b, #10
b 2f
1:
ld1 {v0.8h, v1.8h}, [x3], #32
// !LR_HAVE_LEFT, fill v2 with the leftmost pixel
// and shift v0/v1 to have 3x the first pixel at the front.
dup v2.8h, v0.h[0]
// Move x3 back to account for the last 3 pixels we loaded before,
// which we shifted out.
sub x3, x3, #6
ext v1.16b, v0.16b, v1.16b, #10
ext v0.16b, v2.16b, v0.16b, #10
2:
tst w5, #2 // LR_HAVE_RIGHT
b.ne 4f
// If we'll need to pad the right edge, load that pixel to pad with
// here since we can find it pretty easily from here.
sub w13, w4, #(2 + 16 - 3 + 1)
ldr h30, [x3, w13, sxtw #1]
// Fill v30 with the right padding pixel
dup v30.8h, v30.h[0]
3: // !LR_HAVE_RIGHT
// Check whether we need to pad the right edge
cmp w4, #11
b.ge 4f // If w >= 11, all used input pixels are valid
// 1 <= w < 11, w+1 pixels valid in v0. For w=9 or w=10,
// this ends up called again; it's not strictly needed in those
// cases (we pad enough here), but keeping the code as simple as possible.
// Insert padding in v0.h[w+1] onwards; fuse the +1 into the
// buffer pointer.
movrel x13, right_ext_mask, -1
sub x13, x13, w4, uxtw #1
ld1 {v28.16b, v29.16b}, [x13]
// Blend the padding pixel over the invalid lanes selected by the mask.
bit v0.16b, v30.16b, v28.16b
bit v1.16b, v30.16b, v29.16b
// SGR-style horizontal edge handling, 3-pixel edge, 16 bpc; this variant
// uses x5 as the src pointer, x4 as left and w6 as the width.
// NOTE(review): instructions fused onto the end of comment lines are
// restored onto their own lines below.
// LR_HAVE_LEFT && left == NULL
sub x5, x5, #6
ld1 {v0.8h, v1.8h}, [x5], #32
b 2f
0:
// LR_HAVE_LEFT, left != NULL
ld1 {v0.8h, v1.8h}, [x5], #32
ld1 {v2.d}[1], [x4], #8
// Move x5 back to account for the last 3 pixels we loaded earlier,
// which we'll shift out.
sub x5, x5, #6
ext v1.16b, v0.16b, v1.16b, #10
ext v0.16b, v2.16b, v0.16b, #10
b 2f
1:
ld1 {v0.8h, v1.8h}, [x5], #32
// !LR_HAVE_LEFT, fill v2 with the leftmost pixel
// and shift v0/v1 to have 3x the first pixel at the front.
dup v2.8h, v0.h[0]
// Move x5 back to account for the last 3 pixels we loaded before,
// which we shifted out.
sub x5, x5, #6
ext v1.16b, v0.16b, v1.16b, #10
ext v0.16b, v2.16b, v0.16b, #10
2:
tst w7, #2 // LR_HAVE_RIGHT
b.ne 4f
// If we'll need to pad the right edge, load that pixel to pad with
// here since we can find it pretty easily from here.
sub w13, w6, #(2 + 16 - 3 + 1)
ldr h30, [x5, w13, sxtw #1]
// Fill v30 with the right padding pixel
dup v30.8h, v30.h[0]
3: // !LR_HAVE_RIGHT
// Check whether we need to pad the right edge
cmp w6, #11
b.ge 4f // If w >= 11, all used input pixels are valid
// 1 <= w < 11, w+1 pixels valid in v0. For w=9 or w=10,
// this ends up called again; it's not strictly needed in those
// cases (we pad enough here), but keeping the code as simple as possible.
// Insert padding in v0.h[w+1] onwards; fuse the +1 into the
// buffer pointer.
movrel x13, right_ext_mask, -1
sub x13, x13, w6, uxtw #1
ld1 {v28.16b, v29.16b}, [x13]
// Blend the padding pixel over the invalid lanes selected by the mask.
bit v0.16b, v30.16b, v28.16b
bit v1.16b, v30.16b, v29.16b
// NOTE(review): the following lines are stray non-code text (apparently a
// German web-page disclaimer picked up during extraction); commented out so
// the file assembles. English translation: "The information on this website
// has been carefully compiled to the best of our knowledge. However, no
// guarantee is made for the completeness, correctness, or quality of the
// information provided. Note: the colored syntax highlighting and the
// measurement are still experimental."