// Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
tst r5, #1 // LR_HAVE_LEFT
beq 1f
// LR_HAVE_LEFT
cmp r1, #0
bne 0f
// left == NULL sub r2, r2, #6
vld1.16 {q2, q3}, [r2]!
b 2f
0:
// LR_HAVE_LEFT, left != NULL
vld1.16 {q2, q3}, [r2]!
vld1.16 {d3}, [r1]!
// Move r2 back to account for the last 3 pixels we loaded earlier,
// which we'll shift out. sub r2, r2, #6
vext.8 q3, q2, q3, #10
vext.8 q2, q1, q2, #10
b 2f
1:
vld1.16 {q2, q3}, [r2]!
// !LR_HAVE_LEFT, fill q1 with the leftmost pixel
// and shift q2/q3 to have 3x the first pixel at the front.
vdup.16 q1, d4[0]
// Move r2 back to account for the last 3 pixels we loaded before,
// which we shifted out. sub r2, r2, #6
vext.8 q3, q2, q3, #10
vext.8 q2, q1, q2, #10
2:
tst r5, #2 // LR_HAVE_RIGHT
bne 4f
3: // !LR_HAVE_RIGHT
// Check whether we need to pad the right edge
cmp r4, #11
bge 4f // If w >= 11, all used input pixels are valid
// 1 <= w < 11, w+3 pixels valid in q2-q3. For w=9 or w=10,
// this ends up called again; it's not strictly needed in those
// cases (we pad enough here), but keeping the code as simple as possible.
// The padding pixel is q1/2.h[w+2]. r2 points at the next input, ie
// q1/2.h[16]. Thus read from r2[w-14] to find the padding pixel. sub r12, r4, #14
lsl r12, r12, #1
// Insert padding in q2/3.h[w+3] onwards; fuse the +3 (*2) into the
// buffer pointer.
movrel_local r3, right_ext_mask, -6
ldrh r12, [r2, r12] sub r3, r3, r4, lsl #1
vdup.16 q11, r12
vld1.8 {q9, q10}, [r3]
ble 9f
vmov q2, q3
tst r5, #2 // LR_HAVE_RIGHT
vld1.16 {q3}, [r2]!
bne 4b // If we don't need to pad, just keep filtering.
b 3b // If we need to pad, check how many pixels we have left.
9:
pop {r4-r6,pc}
endfunc
// void dav1d_wiener_filter_v_16bpc_neon(pixel *dst, int16_t **ptrs,
//                                       const int16_t fv[8], const int w,
//                                       const int bitdepth_max);
//
// Register arguments as used below: r1 = ptrs, r2 = fv. bitdepth_max is
// the fifth argument and is read from the stack.
function wiener_filter_v_16bpc_neon, export=1
        push            {r4-r9,lr}
        vpush           {q4-q7}

        // 7 core registers (28 bytes) + q4-q7 (64 bytes) were pushed above,
        // so the first stack argument now lives at sp + 92.
        ldr             lr,  [sp, #92] // bitdepth_max
        vld1.16         {q0},  [r2, :128] // q0 = fv[0..7]
        vdup.16         q2,  lr           // q2 = bitdepth_max in every lane
        clz             lr,  lr
        sub             lr,  lr,  #11 // round_bits_v

        // NOTE(review): the filtering code, the matching vpop/pop and the
        // endfunc of this function are not visible in this excerpt.

        // Shift the pointers, but only update the first 5; the 6th pointer is
        // kept as it was before (and the 7th is implicitly identical to the
        // 6th).
        ldrd            r4,  r5,  [r1, #4]  // ptrs[1], ptrs[2]
        ldrd            r6,  r7,  [r1, #12] // ptrs[3], ptrs[4]
        ldr             r8,  [r1, #20]      // ptrs[5]
        strd            r4,  r5,  [r1]      // -> ptrs[0], ptrs[1]
        strd            r6,  r7,  [r1, #8]  // -> ptrs[2], ptrs[3]
        str             r8,  [r1, #16]      // -> ptrs[4]
// Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
//
// Registers as used in this section: r1 = left, r2 = src (post-incremented
// by the loads), r4 = w, r5 = edge flags (bit 0 = LR_HAVE_LEFT,
// bit 1 = LR_HAVE_RIGHT). Pixel data is kept in q2/q3, q4 is scratch.
// NOTE(review): labels 4 and 9 targeted by branches below are defined
// outside this section.
        tst             r5,  #1 // LR_HAVE_LEFT
        beq             1f
        // LR_HAVE_LEFT
        cmp             r1,  #0
        bne             0f
        // left == NULL
        // The 3 left-edge pixels sit directly before src; start loading there.
        sub             r2,  r2,  #6
        vld1.16         {q2, q3}, [r2]!
        b               2f

0:
        // LR_HAVE_LEFT, left != NULL
        vld1.16         {q2, q3}, [r2]!
        vld1.16         {d9},  [r1]! // d9 is the upper half of q4
        // Move r2 back to account for the last 3 pixels we loaded earlier,
        // which we'll shift out.
        sub             r2,  r2,  #6
        // Shift q2/q3 up by 3 pixels (6 bytes), pulling in the top 3 pixels
        // of q4 (the last 3 pixels read from left) at the front.
        vext.8          q3,  q2,  q3,  #10
        vext.8          q2,  q4,  q2,  #10
        b               2f

1:
        vld1.16         {q2, q3}, [r2]!
        // !LR_HAVE_LEFT, fill q4 with the leftmost pixel
        // and shift q2/q3 to have 3x the first pixel at the front.
        vdup.16         q4,  d4[0]
        // Move r2 back to account for the last 3 pixels we loaded before,
        // which we shifted out.
        sub             r2,  r2,  #6
        vext.8          q3,  q2,  q3,  #10
        vext.8          q2,  q4,  q2,  #10

2:
        tst             r5,  #2 // LR_HAVE_RIGHT
        bne             4f

3:      // !LR_HAVE_RIGHT

        // Check whether we need to pad the right edge
        cmp             r4,  #11
        bge             4f   // If w >= 11, all used input pixels are valid

        // 1 <= w < 11, w+3 pixels valid in q2-q3. For w=9 or w=10,
        // this ends up called again; it's not strictly needed in those
        // cases (we pad enough here), but keeping the code as simple as possible.

        // The padding pixel is q2/3.h[w+2]. r2 points at the next input, ie
        // q2/3.h[16]. Thus read from r2[w-14] to find the padding pixel.
        sub             lr,  r4,  #14
        lsl             lr,  lr,  #1
        // Insert padding in q2/3.h[w+3] onwards; fuse the +3 (*2) into the
        // buffer pointer.
        movrel_local    r3,  right_ext_mask, -6
        ldrh            lr,  [r2, lr]
        sub             r3,  r3,  r4,  lsl #1
        vdup.16         q4,  lr
        vld1.8          {q8, q9}, [r3]

        // NOTE(review): nothing between the mask load above and the branch
        // below uses q4/q8/q9 or updates the flags; the padding insert and
        // the filtering body (label 4) appear to be missing from this excerpt.
        ble             9f
        vmov            q2,  q3
        tst             r5,  #2 // LR_HAVE_RIGHT
        vld1.16         {q3}, [r2]!
        bne             4b // If we don't need to pad, just keep filtering.
        b               3b // If we need to pad, check how many pixels we have left.
9:
// Exit path: rotate the window of row pointers stored in the ptrs array.
// NOTE(review): r10/r11 are used as scratch below, but the only prologue
// visible in this excerpt saves r4-r9 and lr; confirm that the function
// this fragment belongs to saves r10/r11 and that #108 matches its frame.
// Reload ptrs from arguments on the stack
ldr lr, [sp, #108]
// Rotate the window of pointers. Shift the 6 pointers downwards one step.
// Loads: r6..r11 = old ptrs[1..6] (4-byte entries at offsets 4..24).
ldrd r6, r7, [lr, #4]
ldrd r8, r9, [lr, #12]
ldrd r10, r11, [lr, #20]
// Stores: ptrs[0..5] = old ptrs[1..6].
strd r6, r7, [lr]
strd r8, r9, [lr, #8]
strd r10, r11, [lr, #16]
// The topmost pointer, ptrs[6], which isn't used as input, is set to
// ptrs[0], which will be used as output for the next _hv call.
// At the start of the filtering, the caller may set ptrs[6] to the
// right next buffer to fill in, instead.
// r6 holds the value just stored to ptrs[0] (the old ptrs[1]).
str r6, [lr, #24]
// LR_HAVE_LEFT && left == NULL
//
// Registers as used in this section: r2 = left, r3 = src (post-incremented
// by the loads), r4 = w, r5 = edge flags (bit 1 = LR_HAVE_RIGHT). Pixel
// data is kept in q0/q1, q2 is scratch, q14 receives the padding pixel.
// NOTE(review): the branches that select between this path and labels
// 0/1, and label 4, are outside this section.
        // The 2 left-edge pixels sit directly before src; start loading there.
        sub             r3,  r3,  #4
        vld1.8          {q0, q1}, [r3]!
        b               2f

0:
        // LR_HAVE_LEFT, left != NULL
        vld1.8          {q0, q1}, [r3]!
        vld1.16         {d5},  [r2] // d5 is the upper half of q2
        // Move r3 back to account for the last 2 pixels we loaded earlier,
        // which we'll shift out.
        sub             r3,  r3,  #4
        // Shift q0/q1 up by 2 pixels (4 bytes), pulling in the top 2 pixels
        // of q2 (the last 2 pixels read from left) at the front.
        vext.8          q1,  q0,  q1,  #12
        vext.8          q0,  q2,  q0,  #12
        b               2f

1:
        vld1.8          {q0, q1}, [r3]!
        // !LR_HAVE_LEFT, fill q2 with the leftmost pixel
        // and shift q0/q1 to have 2x the first pixel at the front.
        vdup.16         q2,  d0[0]
        // Move r3 back to account for the last 2 pixels we loaded before,
        // which we shifted out.
        sub             r3,  r3,  #4
        vext.8          q1,  q0,  q1,  #12
        vext.8          q0,  q2,  q0,  #12

2:
        tst             r5,  #2 // LR_HAVE_RIGHT
        bne             4f
        // If we'll need to pad the right edge, load that pixel to pad with
        // here since we can find it pretty easily from here.
        sub             lr,  r4,  #(2 + 16 - 2 + 1)
        lsl             lr,  lr,  #1 // pixel index -> byte offset
        ldrh            lr,  [r3, lr]
        // Fill q14 with the right padding pixel
        vdup.16         q14, lr

3:      // !LR_HAVE_RIGHT

        // Check whether we need to pad the right edge
        cmp             r4,  #10
        bge             4f   // If w >= 10, all used input pixels are valid

        // 1 <= w < 10, w pixels valid in q0-q1. For w=9, this ends up called
        // again; it's not strictly needed in those cases (we pad enough here),
        // but keeping the code as simple as possible.

        // Insert padding in q0.h[w] onwards
        movrel_local    lr,  right_ext_mask
        sub             lr,  lr,  r4,  lsl #1
        vld1.8          {q12, q13}, [lr]
// LR_HAVE_LEFT && left == NULL
//
// Registers as used in this section: r2 = left, r3 = src (post-incremented
// by the loads), r4 = w, r5 = edge flags (bit 1 = LR_HAVE_RIGHT). Pixel
// data is kept in q0/q1, q2 is scratch, q14 receives the padding pixel.
// NOTE(review): the branches that select between this path and labels
// 0/1, and labels 4 and 9, are outside this section.
        // The 3 left-edge pixels sit directly before src; start loading there.
        sub             r3,  r3,  #6
        vld1.8          {q0, q1}, [r3]!
        b               2f

0:
        // LR_HAVE_LEFT, left != NULL
        vld1.8          {q0, q1}, [r3]!
        vld1.16         {d5},  [r2] // d5 is the upper half of q2
        // Move r3 back to account for the last 3 pixels we loaded earlier,
        // which we'll shift out.
        sub             r3,  r3,  #6
        // Shift q0/q1 up by 3 pixels (6 bytes), pulling in the top 3 pixels
        // of q2 (the last 3 pixels read from left) at the front.
        vext.8          q1,  q0,  q1,  #10
        vext.8          q0,  q2,  q0,  #10
        b               2f

1:
        vld1.8          {q0, q1}, [r3]!
        // !LR_HAVE_LEFT, fill q2 with the leftmost pixel
        // and shift q0/q1 to have 3x the first pixel at the front.
        vdup.16         q2,  d0[0]
        // Move r3 back to account for the last 3 pixels we loaded before,
        // which we shifted out.
        sub             r3,  r3,  #6
        vext.8          q1,  q0,  q1,  #10
        vext.8          q0,  q2,  q0,  #10

2:
        tst             r5,  #2 // LR_HAVE_RIGHT
        bne             4f
        // If we'll need to pad the right edge, load that pixel to pad with
        // here since we can find it pretty easily from here.
        sub             lr,  r4,  #(2 + 16 - 3 + 1)
        lsl             lr,  lr,  #1 // pixel index -> byte offset
        ldrh            lr,  [r3, lr]
        // Fill q14 with the right padding pixel
        vdup.16         q14, lr

3:      // !LR_HAVE_RIGHT

        // Check whether we need to pad the right edge
        cmp             r4,  #11
        bge             4f   // If w >= 11, all used input pixels are valid

        // 1 <= w < 11, w+1 pixels valid in q0-q1. For w=9 or w=10,
        // this ends up called again; it's not strictly needed in those
        // cases (we pad enough here), but keeping the code as simple as possible.

        // Insert padding in q0.h[w+1] onwards; fuse the +1 into the
        // buffer pointer.
        movrel_local    lr,  right_ext_mask, -2
        sub             lr,  lr,  r4,  lsl #1
        vld1.8          {q12, q13}, [lr]

        // NOTE(review): nothing between the mask load above and the branch
        // below uses q12-q14 or updates the flags; the padding insert and
        // the summing body (label 4) appear to be missing from this excerpt.
        ble             9f
        tst             r5,  #2 // LR_HAVE_RIGHT
        vmov            q0,  q1
        vld1.16         {q1}, [r3]!
        bne             4b // If we don't need to pad, just keep summing.
        b               3b // If we need to pad, check how many pixels we have left.
// LR_HAVE_LEFT && left == NULL
//
// Registers as used in this section: r4 = left, r5 = src (post-incremented
// by the loads), r6 = w, r7 = edge flags (bit 1 = LR_HAVE_RIGHT). Pixel
// data is kept in q0/q1, q2 is scratch, q14 receives the padding pixel.
// NOTE(review): the branches that select between this path and labels
// 0/1, and labels 4 and 9, are outside this section.
        // The 3 left-edge pixels sit directly before src; start loading there.
        sub             r5,  r5,  #6
        vld1.8          {q0, q1}, [r5]!
        b               2f

0:
        // LR_HAVE_LEFT, left != NULL
        vld1.8          {q0, q1}, [r5]!
        vld1.16         {d5},  [r4] // d5 is the upper half of q2
        // Move r5 back to account for the last 3 pixels we loaded earlier,
        // which we'll shift out.
        sub             r5,  r5,  #6
        // Shift q0/q1 up by 3 pixels (6 bytes), pulling in the top 3 pixels
        // of q2 (the last 3 pixels read from left) at the front.
        vext.8          q1,  q0,  q1,  #10
        vext.8          q0,  q2,  q0,  #10
        b               2f

1:
        vld1.8          {q0, q1}, [r5]!
        // !LR_HAVE_LEFT, fill q2 with the leftmost pixel
        // and shift q0/q1 to have 3x the first pixel at the front.
        vdup.16         q2,  d0[0]
        // Move r5 back to account for the last 3 pixels we loaded before,
        // which we shifted out.
        sub             r5,  r5,  #6
        vext.8          q1,  q0,  q1,  #10
        vext.8          q0,  q2,  q0,  #10

2:
        tst             r7,  #2 // LR_HAVE_RIGHT
        bne             4f
        // If we'll need to pad the right edge, load that pixel to pad with
        // here since we can find it pretty easily from here.
        sub             lr,  r6,  #(2 + 16 - 3 + 1)
        lsl             lr,  lr,  #1 // pixel index -> byte offset
        ldrh            lr,  [r5, lr]
        // Fill q14 with the right padding pixel
        vdup.16         q14, lr

3:      // !LR_HAVE_RIGHT

        // Check whether we need to pad the right edge
        cmp             r6,  #11
        bge             4f   // If w >= 11, all used input pixels are valid

        // 1 <= w < 11, w+1 pixels valid in q0-q1. For w=9 or w=10,
        // this ends up called again; it's not strictly needed in those
        // cases (we pad enough here), but keeping the code as simple as possible.

        // Insert padding in q0.h[w+1] onwards; fuse the +1 into the
        // buffer pointer.
        movrel_local    lr,  right_ext_mask, -2
        sub             lr,  lr,  r6,  lsl #1
        vld1.8          {q12, q13}, [lr]

        // NOTE(review): nothing between the mask load above and the branch
        // below uses q12-q14 or updates the flags; the padding insert and
        // the summing body (label 4) appear to be missing from this excerpt.
        ble             9f
        tst             r7,  #2 // LR_HAVE_RIGHT
        vmov            q0,  q1
        vld1.16         {q1}, [r5]!
        bne             4b // If we don't need to pad, just keep summing.
        b               3b // If we need to pad, check how many pixels we have left.
// Die Informationen auf dieser Webseite wurden nach bestem Wissen sorgfältig
// zusammengestellt. Es wird jedoch weder Vollständigkeit noch Richtigkeit
// noch Qualität der bereitgestellten Informationen zugesichert.
// Bemerkung:
// Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.