// Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
tst r5, #1 // LR_HAVE_LEFT
beq 1f
// LR_HAVE_LEFT
cmp r1, #0
bne 0f
// left == NULL
sub r2, r2, #3
vld1.8 {q2}, [r2]!
b 2f
0:
// LR_HAVE_LEFT, left != NULL
vld1.8 {q2}, [r2]!
vld1.32 {d3[1]}, [r1]
// Move r2 back to account for the last 3 bytes we loaded earlier,
// which we'll shift out.
sub r2, r2, #3
vext.8 q2, q1, q2, #13
b 2f
1:
vld1.8 {q2}, [r2]!
// !LR_HAVE_LEFT, fill q1 with the leftmost byte
// and shift q2 to have 3x the first byte at the front.
vdup.8 q1, d4[0]
// Move r2 back to account for the last 3 bytes we loaded before,
// which we shifted out.
sub r2, r2, #3
vext.8 q2, q1, q2, #13
2:
vmovl.u8 q1, d4
vmovl.u8 q2, d5
tst r5, #2 // LR_HAVE_RIGHT
bne 4f
3: // !LR_HAVE_RIGHT
// Check whether we need to pad the right edge
cmp r4, #11
bge 4f // If w >= 11, all used input pixels are valid
// 1 <= w < 11, w+3 pixels valid in q1-q2. For w=9 or w=10,
// this ends up called again; it's not strictly needed in those
// cases (we pad enough here), but keeping the code as simple as possible.
// The padding pixel is q1/2.h[w+2]. r2 points at the next input, ie
// q1/2.h[16]. Thus read from r2[w-14] to find the padding pixel.
sub r12, r4, #14
// Insert padding in q1/2.h[w+3] onwards; fuse the +3 (*2) into the
// buffer pointer.
movrel_local r3, right_ext_mask, -6
ldrb r12, [r2, r12]
sub r3, r3, r4, lsl #1
vdup.16 q13, r12
vld1.8 {q10, q11}, [r3]
ble 9f
vmov q1, q2
vld1.8 {d4}, [r2]!
tst r5, #2 // LR_HAVE_RIGHT
vmovl.u8 q2, d4
bne 4b // If we don't need to pad, just keep filtering.
b 3b // If we need to pad, check how many pixels we have left.
9:
pop {r4-r5,pc}
endfunc
// void dav1d_wiener_filter_v_8bpc_neon(pixel *dst, int16_t **ptrs,
// const int16_t fv[8], const int w);
function wiener_filter_v_8bpc_neon, export=1
push {r4-r9,lr}
vpush {q4-q6}
// Shift the pointers, but only update the first 5; the 6th pointer is
// kept as it was before (and the 7th is implicitly identical to the
// 6th).
ldrd r4, r5, [r1, #4]
ldrd r6, r7, [r1, #12]
ldr r8, [r1, #20]
strd r4, r5, [r1]
strd r6, r7, [r1, #8]
str r8, [r1, #16]
// Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
tst r5, #1 // LR_HAVE_LEFT
beq 1f
// LR_HAVE_LEFT
cmp r1, #0
bne 0f
// left == NULL
sub r2, r2, #3
vld1.8 {q2}, [r2]!
b 2f
0:
// LR_HAVE_LEFT, left != NULL
vld1.8 {q2}, [r2]!
vld1.32 {d3[1]}, [r1]
// Move r2 back to account for the last 3 bytes we loaded earlier,
// which we'll shift out.
sub r2, r2, #3
vext.8 q2, q1, q2, #13
b 2f
1:
vld1.8 {q2}, [r2]!
// !LR_HAVE_LEFT, fill q1 with the leftmost byte
// and shift q2 to have 3x the first byte at the front.
vdup.8 q3, d4[0]
// Move r2 back to account for the last 3 bytes we loaded before,
// which we shifted out.
sub r2, r2, #3
vext.8 q2, q3, q2, #13
2:
vmovl.u8 q3, d5
vmovl.u8 q2, d4
tst r5, #2 // LR_HAVE_RIGHT
bne 4f
3: // !LR_HAVE_RIGHT
// Check whether we need to pad the right edge
cmp r4, #11
bge 4f // If w >= 11, all used input pixels are valid
// 1 <= w < 11, w+3 pixels valid in q1-q2. For w=9 or w=10,
// this ends up called again; it's not strictly needed in those
// cases (we pad enough here), but keeping the code as simple as possible.
// The padding pixel is q1/2.h[w+2]. r2 points at the next input, ie
// q1/2.h[16]. Thus read from r2[w-14] to find the padding pixel.
sub lr, r4, #14
// Insert padding in q1/2.h[w+3] onwards; fuse the +3 (*2) into the
// buffer pointer.
movrel_local r3, right_ext_mask, -6
ldrb lr, [r2, lr]
sub r3, r3, r4, lsl #1
vdup.16 q13, lr
vld1.8 {q10, q11}, [r3]
ble 9f
vmov q2, q3
vld1.8 {d6}, [r2]!
tst r5, #2 // LR_HAVE_RIGHT
vmovl.u8 q3, d6
bne 4b // If we don't need to pad, just keep filtering.
b 3b // If we need to pad, check how many pixels we have left.
9:
// Reload ptrs from arguments on the stack
ldr lr, [sp, #108]
// Rotate the window of pointers. Shift the 6 pointers downwards one step.
ldrd r6, r7, [lr, #4]
ldrd r8, r9, [lr, #12]
ldrd r10, r11, [lr, #20]
strd r6, r7, [lr]
strd r8, r9, [lr, #8]
strd r10, r11, [lr, #16]
// The topmost pointer, ptrs[6], which isn't used as input, is set to
// ptrs[0], which will be used as output for the next _hv call.
// At the start of the filtering, the caller may set ptrs[6] to the
// right next buffer to fill in, instead.
str r6, [lr, #24]
// LR_HAVE_LEFT && left == NULL
sub r3, r3, #2
vld1.8 {q0}, [r3]!
b 2f
0:
// LR_HAVE_LEFT, left != NULL
vld1.8 {q0}, [r3]!
vld1.32 {d3[]}, [r2]
// Move r3 back to account for the last 2 bytes we loaded earlier,
// which we'll shift out.
sub r3, r3, #2
vext.8 q0, q1, q0, #14
b 2f
1:
vld1.8 {q0}, [r3]!
// !LR_HAVE_LEFT, fill q1 with the leftmost byte
// and shift q0 to have 2x the first byte at the front.
vdup.8 q1, d0[0]
// Move r3 back to account for the last 2 bytes we loaded before,
// which we shifted out.
sub r3, r3, #2
vext.8 q0, q1, q0, #14
2:
vmull.u8 q1, d0, d0
vmull.u8 q2, d1, d1
tst r5, #2 // LR_HAVE_RIGHT
bne 4f
// If we'll need to pad the right edge, load that byte to pad with
// here since we can find it pretty easily from here.
sub lr, r4, #(2 + 16 - 2 + 1)
ldrb lr, [r3, lr]
// Fill q14 with the right padding pixel
vdup.8 q14, lr
3: // !LR_HAVE_RIGHT
// Check whether we need to pad the right edge
cmp r4, #10
bge 4f // If w >= 10, all used input pixels are valid
// 1 <= w < 10, w pixels valid in q0. For w=9, this ends up called
// again; it's not strictly needed in those cases (we pad enough here),
// but keeping the code as simple as possible.
// Insert padding in q0.b[w] onwards
movrel_local lr, right_ext_mask
sub lr, lr, r4
vld1.8 {q13}, [lr]
// LR_HAVE_LEFT && left == NULL
sub r3, r3, #3
vld1.8 {q0}, [r3]!
b 2f
0:
// LR_HAVE_LEFT, left != NULL
vld1.8 {q0}, [r3]!
vld1.32 {d3[]}, [r2]
// Move r3 back to account for the last 3 bytes we loaded earlier,
// which we'll shift out.
sub r3, r3, #3
vext.8 q0, q1, q0, #13
b 2f
1:
vld1.8 {q0}, [r3]!
// !LR_HAVE_LEFT, fill q1 with the leftmost byte
// and shift q0 to have 3x the first byte at the front.
vdup.8 q1, d0[0]
// Move r3 back to account for the last 3 bytes we loaded before,
// which we shifted out.
sub r3, r3, #3
vext.8 q0, q1, q0, #13
2:
vmull.u8 q1, d0, d0
vmull.u8 q2, d1, d1
tst r5, #2 // LR_HAVE_RIGHT
bne 4f
// If we'll need to pad the right edge, load that byte to pad with
// here since we can find it pretty easily from here.
sub lr, r4, #(2 + 16 - 3 + 1)
ldrb lr, [r3, lr]
// Fill q14 with the right padding pixel
vdup.8 q14, lr
3: // !LR_HAVE_RIGHT
// Check whether we need to pad the right edge
cmp r4, #11
bge 4f // If w >= 11, all used input pixels are valid
// 1 <= w < 11, w+1 pixels valid in q0. For w=9 or w=10,
// this ends up called again; it's not strictly needed in those
// cases (we pad enough here), but keeping the code as simple as possible.
// Insert padding in q0.b[w+1] onwards; fuse the +1 into the
// buffer pointer.
movrel_local lr, right_ext_mask, -1
sub lr, lr, r4
vld1.8 {q13}, [lr]
ble 9f
tst r5, #2 // LR_HAVE_RIGHT
vld1.8 {d6}, [r3]!
vmov q1, q2
vext.8 q0, q0, q3, #8
vmull.u8 q2, d6, d6
bne 4b // If we don't need to pad, just keep summing.
b 3b // If we need to pad, check how many pixels we have left.
// LR_HAVE_LEFT && left == NULL
sub r5, r5, #3
vld1.8 {q0}, [r5]!
b 2f
0:
// LR_HAVE_LEFT, left != NULL
vld1.8 {q0}, [r5]!
vld1.32 {d3[]}, [r4]
// Move r3 back to account for the last 3 bytes we loaded earlier,
// which we'll shift out.
sub r5, r5, #3
vext.8 q0, q1, q0, #13
b 2f
1:
vld1.8 {q0}, [r5]!
// !LR_HAVE_LEFT, fill q1 with the leftmost byte
// and shift q0 to have 3x the first byte at the front.
vdup.8 q1, d0[0]
// Move r3 back to account for the last 3 bytes we loaded before,
// which we shifted out.
sub r5, r5, #3
vext.8 q0, q1, q0, #13
2:
vmull.u8 q1, d0, d0
vmull.u8 q2, d1, d1
tst r7, #2 // LR_HAVE_RIGHT
bne 4f
// If we'll need to pad the right edge, load that byte to pad with
// here since we can find it pretty easily from here.
sub lr, r6, #(2 + 16 - 3 + 1)
ldrb lr, [r5, lr]
// Fill q14 with the right padding pixel
vdup.8 q14, lr
3: // !LR_HAVE_RIGHT
// Check whether we need to pad the right edge
cmp r6, #11
bge 4f // If w >= 11, all used input pixels are valid
// 1 <= w < 11, w+1 pixels valid in q0. For w=9 or w=10,
// this ends up called again; it's not strictly needed in those
// cases (we pad enough here), but keeping the code as simple as possible.
// Insert padding in q0.b[w+1] onwards; fuse the +1 into the
// buffer pointer.
movrel_local lr, right_ext_mask, -1
sub lr, lr, r6
vld1.8 {q13}, [lr]
ble 9f
tst r7, #2 // LR_HAVE_RIGHT
vld1.8 {d6}, [r5]!
vmov q1, q2
vext.8 q0, q0, q3, #8
vmull.u8 q2, d6, d6
bne 4b // If we don't need to pad, just keep summing.
b 3b // If we need to pad, check how many pixels we have left.
// NOTE(review): the following disclaimer text is extraneous to this source
// file (it appears to be website boilerplate picked up during extraction);
// kept here, commented out and translated, so it cannot break assembly:
// "The information on this website has been compiled carefully to the best
// of our knowledge. However, neither completeness, nor correctness, nor
// quality of the provided information is guaranteed.
// Remark:
// The color syntax highlighting and the measurement are still experimental."