// Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
tst r5, #1 // LR_HAVE_LEFT
beq 1f
// LR_HAVE_LEFT
cmp r1, #0
bne 0f
// left == NULL sub r2, r2, #3
vld1.8 {q2}, [r2]!
b 2f
0:
// LR_HAVE_LEFT, left != NULL
vld1.8 {q2}, [r2]!
vld1.32 {d3[1]}, [r1]
// Move r2 back to account for the last 3 bytes we loaded earlier,
// which we'll shift out. sub r2, r2, #3
vext.8 q2, q1, q2, #13
b 2f
1:
vld1.8 {q2}, [r2]!
// !LR_HAVE_LEFT, fill q1 with the leftmost byte
// and shift q2 to have 3x the first byte at the front.
vdup.8 q1, d4[0]
// Move r2 back to account for the last 3 bytes we loaded before,
// which we shifted out. sub r2, r2, #3
vext.8 q2, q1, q2, #13
2:
vmovl.u8 q1, d4
vmovl.u8 q2, d5
tst r5, #2 // LR_HAVE_RIGHT
bne 4f
3: // !LR_HAVE_RIGHT
// Check whether we need to pad the right edge
cmp r4, #11
bge 4f // If w >= 11, all used input pixels are valid
// 1 <= w < 11, w+3 pixels valid in q1-q2. For w=9 or w=10,
// this ends up called again; it's not strictly needed in those
// cases (we pad enough here), but keeping the code as simple as possible.
// The padding pixel is q1/2.h[w+2]. r2 points at the next input, ie
// q1/2.h[16]. Thus read from r2[w-14] to find the padding pixel. sub r12, r4, #14
// Insert padding in q1/2.h[w+3] onwards; fuse the +3 (*2) into the
// buffer pointer.
movrel_local r3, right_ext_mask, -6
ldrb r12, [r2, r12] sub r3, r3, r4, lsl #1
vdup.16 q13, r12
vld1.8 {q10, q11}, [r3]
ble 9f
vmov q1, q2
vld1.8 {d4}, [r2]!
tst r5, #2 // LR_HAVE_RIGHT
vmovl.u8 q2, d4
bne 4b // If we don't need to pad, just keep filtering.
b 3b // If we need to pad, check how many pixels we have left.
9:
pop {r4-r5,pc}
endfunc
// void dav1d_wiener_filter_v_8bpc_neon(pixel *dst, int16_t **ptrs,
//                                      const int16_t fv[8], const int w);
//
// NOTE(review): this block appears to have been damaged by text extraction.
// Several instructions had been folded into the tail of the preceding comment
// line (or onto the previous instruction's line), which left them commented
// out or unassemblable; they are restored to their own lines below.
// Other problems are only flagged here, not repaired, because the missing
// text is not available in this view:
//   - branches target a local label "4" that is never defined in this block;
//   - the prologue (push {r4-r9,lr} + vpush {q4-q6}) does not match the
//     epilogue (pop {r4-r7,pc}), and r10/r11 are written without being saved;
//   - register roles change between sections (e.g. r5 is loaded as a row
//     pointer and then tested as edge flags), which suggests that fragments
//     of several routines were concatenated.
// Verify against the authoritative source before assembling.
function wiener_filter_v_8bpc_neon, export=1
        push            {r4-r9,lr}
        vpush           {q4-q6}

        // Shift the pointers, but only update the first 5; the 6th pointer is
        // kept as it was before (and the 7th is implicitly identical to the
        // 6th).
        ldrd            r4,  r5,  [r1, #4]
        ldrd            r6,  r7,  [r1, #12]
        ldr             r8,  [r1, #20]
        strd            r4,  r5,  [r1]
        strd            r6,  r7,  [r1, #8]
        str             r8,  [r1, #16]

        // NOTE(review): from here on r1/r2/r4/r5 are treated as
        // left/src/w/edge-flags (per the comments below) rather than as the
        // values set up above -- see the header note.

        // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
        tst             r5,  #1 // LR_HAVE_LEFT
        beq             1f
        // LR_HAVE_LEFT
        cmp             r1,  #0
        bne             0f
        // left == NULL
        sub             r2,  r2,  #3
        vld1.8          {q2}, [r2]!
        b               2f

0:
        // LR_HAVE_LEFT, left != NULL
        vld1.8          {q2}, [r2]!
        vld1.32         {d3[1]}, [r1]
        // Move r2 back to account for the last 3 bytes we loaded earlier,
        // which we'll shift out.
        sub             r2,  r2,  #3
        vext.8          q2,  q1,  q2,  #13
        b               2f

1:
        vld1.8          {q2}, [r2]!
        // !LR_HAVE_LEFT, fill q3 with the leftmost byte
        // and shift q2 to have 3x the first byte at the front.
        vdup.8          q3,  d4[0]
        // Move r2 back to account for the last 3 bytes we loaded before,
        // which we shifted out.
        sub             r2,  r2,  #3
        vext.8          q2,  q3,  q2,  #13

2:
        // Widen the 16 loaded bytes to 16 bit; d5 is consumed first since
        // the second widening overwrites q2 (d4/d5).
        vmovl.u8        q3,  d5
        vmovl.u8        q2,  d4

        tst             r5,  #2 // LR_HAVE_RIGHT
        bne             4f

3:      // !LR_HAVE_RIGHT

        // Check whether we need to pad the right edge
        cmp             r4,  #11
        bge             4f   // If w >= 11, all used input pixels are valid

        // 1 <= w < 11, w+3 pixels valid in q2-q3. For w=9 or w=10,
        // this ends up called again; it's not strictly needed in those
        // cases (we pad enough here), but keeping the code as simple as possible.

        // The padding pixel is q2/3.h[w+2]. r2 points at the next input, ie
        // q2/3.h[16]. Thus read from r2[w-14] to find the padding pixel.
        sub             lr,  r4,  #14
        // Insert padding in q2/3.h[w+3] onwards; fuse the +3 (*2) into the
        // buffer pointer.
        movrel_local    r3,  right_ext_mask, -6
        ldrb            lr,  [r2, lr]
        sub             r3,  r3,  r4,  lsl #1
        vdup.16         q13, lr
        vld1.8          {q10, q11}, [r3]

        // NOTE(review): the code that applies the mask in q10/q11, the
        // definition of label 4 and whatever sets the flags for the "ble"
        // below are not present in this view.
        ble             9f
        vmov            q2,  q3
        vld1.8          {d6}, [r2]!
        tst             r5,  #2 // LR_HAVE_RIGHT
        vmovl.u8        q3,  d6
        bne             4b // If we don't need to pad, just keep filtering.
        b               3b // If we need to pad, check how many pixels we have left.

9:
        // Reload ptrs from arguments on the stack
        ldr             lr,  [sp, #108]

        // Rotate the window of pointers. Shift the 6 pointers downwards one step.
        ldrd            r6,  r7,  [lr, #4]
        ldrd            r8,  r9,  [lr, #12]
        ldrd            r10, r11, [lr, #20]

        strd            r6,  r7,  [lr]
        strd            r8,  r9,  [lr, #8]
        strd            r10, r11, [lr, #16]
        // The topmost pointer, ptrs[6], which isn't used as input, is set to
        // ptrs[0], which will be used as output for the next _hv call.
        // At the start of the filtering, the caller may set ptrs[6] to the
        // right next buffer to fill in, instead.
        str             r6,  [lr, #24]

        // NOTE(review): the edge-flag tests that would branch to labels 0/1
        // of the next section are not present in this view; here r2/r3 are
        // used as left/src.

        // LR_HAVE_LEFT && left == NULL
        sub             r3,  r3,  #2
        vld1.8          {q0}, [r3]!
        b               2f

0:
        // LR_HAVE_LEFT, left != NULL
        vld1.8          {q0}, [r3]!
        vld1.32         {d3[]}, [r2]
        // Move r3 back to account for the last 2 bytes we loaded earlier,
        // which we'll shift out.
        sub             r3,  r3,  #2
        vext.8          q0,  q1,  q0,  #14
        b               2f

1:
        vld1.8          {q0}, [r3]!
        // !LR_HAVE_LEFT, fill q1 with the leftmost byte
        // and shift q0 to have 2x the first byte at the front.
        vdup.8          q1,  d0[0]
        // Move r3 back to account for the last 2 bytes we loaded before,
        // which we shifted out.
        sub             r3,  r3,  #2
        vext.8          q0,  q1,  q0,  #14

2:
        // Square each of the 16 pixels in q0 into 16 bit lanes (q1, q2).
        vmull.u8        q1,  d0,  d0
        vmull.u8        q2,  d1,  d1

        tst             r5,  #2 // LR_HAVE_RIGHT
        bne             4f
        // If we'll need to pad the right edge, load that byte to pad with
        // here since we can find it pretty easily from here.
        sub             lr,  r4,  #(2 + 16 - 2 + 1)
        ldrb            lr,  [r3, lr]
        // Fill q14 with the right padding pixel
        vdup.8          q14, lr

3:      // !LR_HAVE_RIGHT

        // Check whether we need to pad the right edge
        cmp             r4,  #10
        bge             4f   // If w >= 10, all used input pixels are valid

        // 1 <= w < 10, w pixels valid in q0. For w=9, this ends up called
        // again; it's not strictly needed in those cases (we pad enough here),
        // but keeping the code as simple as possible.

        // Insert padding in q0.b[w] onwards
        movrel_local    lr,  right_ext_mask
        sub             lr,  lr,  r4
        vld1.8          {q13}, [lr]

        // NOTE(review): the remainder of this section (mask application,
        // label 4, loop tail) is not present in this view; a further
        // section with 3 bytes of left context follows directly.

        // LR_HAVE_LEFT && left == NULL
        sub             r3,  r3,  #3
        vld1.8          {q0}, [r3]!
        b               2f

0:
        // LR_HAVE_LEFT, left != NULL
        vld1.8          {q0}, [r3]!
        vld1.32         {d3[]}, [r2]
        // Move r3 back to account for the last 3 bytes we loaded earlier,
        // which we'll shift out.
        sub             r3,  r3,  #3
        vext.8          q0,  q1,  q0,  #13
        b               2f

1:
        vld1.8          {q0}, [r3]!
        // !LR_HAVE_LEFT, fill q1 with the leftmost byte
        // and shift q0 to have 3x the first byte at the front.
        vdup.8          q1,  d0[0]
        // Move r3 back to account for the last 3 bytes we loaded before,
        // which we shifted out.
        sub             r3,  r3,  #3
        vext.8          q0,  q1,  q0,  #13

2:
        // Square each of the 16 pixels in q0 into 16 bit lanes (q1, q2).
        vmull.u8        q1,  d0,  d0
        vmull.u8        q2,  d1,  d1

        tst             r5,  #2 // LR_HAVE_RIGHT
        bne             4f
        // If we'll need to pad the right edge, load that byte to pad with
        // here since we can find it pretty easily from here.
        sub             lr,  r4,  #(2 + 16 - 3 + 1)
        ldrb            lr,  [r3, lr]
        // Fill q14 with the right padding pixel
        vdup.8          q14, lr

3:      // !LR_HAVE_RIGHT

        // Check whether we need to pad the right edge
        cmp             r4,  #11
        bge             4f   // If w >= 11, all used input pixels are valid

        // 1 <= w < 11, w+1 pixels valid in q0. For w=9 or w=10,
        // this ends up called again; it's not strictly needed in those
        // cases (we pad enough here), but keeping the code as simple as possible.

        // Insert padding in q0.b[w+1] onwards; fuse the +1 into the
        // buffer pointer.
        movrel_local    lr,  right_ext_mask, -1
        sub             lr,  lr,  r4
        vld1.8          {q13}, [lr]

        // NOTE(review): mask application, label 4 and the instruction that
        // sets the flags for "ble" are not present in this view.
        ble             9f
        tst             r5,  #2 // LR_HAVE_RIGHT
        vld1.8          {d6}, [r3]!
        vmov            q1,  q2
        vext.8          q0,  q0,  q3,  #8
        vmull.u8        q2,  d6,  d6

        bne             4b // If we don't need to pad, just keep summing.
        b               3b // If we need to pad, check how many pixels we have left.

        // NOTE(review): the next section uses r4/r5/r6/r7 as
        // left/src/w/edge-flags; its leading edge-flag tests are not
        // present in this view.

        // LR_HAVE_LEFT && left == NULL
        sub             r5,  r5,  #3
        vld1.8          {q0}, [r5]!
        b               2f

0:
        // LR_HAVE_LEFT, left != NULL
        vld1.8          {q0}, [r5]!
        vld1.32         {d3[]}, [r4]
        // Move r5 back to account for the last 3 bytes we loaded earlier,
        // which we'll shift out.
        sub             r5,  r5,  #3
        vext.8          q0,  q1,  q0,  #13
        b               2f

1:
        vld1.8          {q0}, [r5]!
        // !LR_HAVE_LEFT, fill q1 with the leftmost byte
        // and shift q0 to have 3x the first byte at the front.
        vdup.8          q1,  d0[0]
        // Move r5 back to account for the last 3 bytes we loaded before,
        // which we shifted out.
        sub             r5,  r5,  #3
        vext.8          q0,  q1,  q0,  #13

2:
        // Square each of the 16 pixels in q0 into 16 bit lanes (q1, q2).
        vmull.u8        q1,  d0,  d0
        vmull.u8        q2,  d1,  d1

        tst             r7,  #2 // LR_HAVE_RIGHT
        bne             4f
        // If we'll need to pad the right edge, load that byte to pad with
        // here since we can find it pretty easily from here.
        sub             lr,  r6,  #(2 + 16 - 3 + 1)
        ldrb            lr,  [r5, lr]
        // Fill q14 with the right padding pixel
        vdup.8          q14, lr

3:      // !LR_HAVE_RIGHT

        // Check whether we need to pad the right edge
        cmp             r6,  #11
        bge             4f   // If w >= 11, all used input pixels are valid

        // 1 <= w < 11, w+1 pixels valid in q0. For w=9 or w=10,
        // this ends up called again; it's not strictly needed in those
        // cases (we pad enough here), but keeping the code as simple as possible.

        // Insert padding in q0.b[w+1] onwards; fuse the +1 into the
        // buffer pointer.
        movrel_local    lr,  right_ext_mask, -1
        sub             lr,  lr,  r6
        vld1.8          {q13}, [lr]

        // NOTE(review): mask application, label 4 and the instruction that
        // sets the flags for "ble" are not present in this view.
        ble             9f
        tst             r7,  #2 // LR_HAVE_RIGHT
        vld1.8          {d6}, [r5]!
        vmov            q1,  q2
        vext.8          q0,  q0,  q3,  #8
        vmull.u8        q2,  d6,  d6

        bne             4b // If we don't need to pad, just keep summing.
        b               3b // If we need to pad, check how many pixels we have left.

9:
        // NOTE(review): restores r4-r7 only, although the prologue pushed
        // r4-r9,lr and q4-q6 -- confirm which prologue this belongs to.
        pop             {r4-r7,pc}
endfunc
// Invoke the sgr_funcs macro (defined outside this view) with argument 8;
// presumably this instantiates the 8 bpc variants -- confirm against the
// macro definition.
sgr_funcs 8
Messung V0.5
¤ Dauer der Verarbeitung: 0.11 Sekunden
(vorverarbeitet)
¤
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.