/*
 * Because the alignment of pixel data to cachelines, and even the number of
 * cachelines per row can vary from row to row, and because of the need to
 * preload each scanline once and only once, this prefetch strategy treats
 * each row of pixels independently. When a pixel row is long enough, there
 * are three distinct phases of prefetch:
 * * an inner loop section, where each time a cacheline of data is
 *   processed, another cacheline is preloaded (the exact distance ahead is
 *   determined empirically using profiling results from lowlevel-blt-bench)
 * * a leading section, where enough cachelines are preloaded to ensure no
 *   cachelines escape being preloaded when the inner loop starts
 * * a trailing section, where a limited number (0 or more) of cachelines
 *   are preloaded to deal with data (if any) that hangs off the end of the
 *   last iteration of the inner loop, plus any trailing bytes that were not
 *   enough to make up one whole iteration of the inner loop
 *
 * There are (in general) three distinct code paths, selected between
 * depending upon how long the pixel row is. If it is long enough that there
 * is at least one iteration of the inner loop (as described above) then
 * this is described as the "wide" case. If it is shorter than that, but
 * there are still enough bytes output that there is at least one 16-byte-
 * long, 16-byte-aligned write to the destination (the optimum type of
 * write), then this is the "medium" case. If it is not even this long, then
 * this is the "narrow" case, and there is no attempt to align writes to
 * 16-byte boundaries. In the "medium" and "narrow" cases, all the
 * cachelines containing data from the pixel row are prefetched up-front.
 */
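/* As an illustrative aside (not part of the generated code), the three-way
 * selection can be modelled in C roughly as follows; the thresholds mirror
 * the cmp/blo pairs further down, the function name is hypothetical, and
 * the fill special case (where wide and medium coincide) is ignored:
 *
 *     enum path { NARROW, MEDIUM, WIDE };
 *
 *     enum path classify_row(unsigned x_pixels, unsigned dst_w_bpp,
 *                            unsigned prefetch_distance,
 *                            unsigned pix_per_block)
 *     {
 *         if (x_pixels < 2 * 16 * 8 / dst_w_bpp - 1)
 *             return NARROW;  // not even one aligned 16-byte block write
 *         if (x_pixels < (prefetch_distance + 3) * pix_per_block - 1)
 *             return MEDIUM;  // too short to keep preloads in flight
 *         return WIDE;
 *     }
 */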
/*
 * Determine whether we put the arguments on the stack for debugging.
 */
#undef DEBUG_PARAMS
/*
 * Bit flags for the 'generate_composite_function' macro, which are used
 * to tune the generated functions' behavior.
 */
.set FLAG_DST_WRITEONLY, 0
.set FLAG_DST_READWRITE, 1
.set FLAG_COND_EXEC, 0
.set FLAG_BRANCH_OVER, 2
.set FLAG_PROCESS_PRESERVES_PSR, 0
.set FLAG_PROCESS_CORRUPTS_PSR, 4
.set FLAG_PROCESS_DOESNT_STORE, 0
.set FLAG_PROCESS_DOES_STORE, 8 /* usually because it needs to conditionally skip it */
.set FLAG_NO_SPILL_LINE_VARS, 0
.set FLAG_SPILL_LINE_VARS_WIDE, 16
.set FLAG_SPILL_LINE_VARS_NON_WIDE, 32
.set FLAG_SPILL_LINE_VARS, 48
.set FLAG_PROCESS_CORRUPTS_SCRATCH, 0
.set FLAG_PROCESS_PRESERVES_SCRATCH, 64
.set FLAG_PROCESS_PRESERVES_WK0, 0
.set FLAG_PROCESS_CORRUPTS_WK0, 128 /* if possible, use the specified register(s) instead so WK0 can hold number of leading pixels */
.set FLAG_PRELOAD_DST, 0
.set FLAG_NO_PRELOAD_DST, 256
/*
 * Number of bytes by which to adjust the preload offset of the destination
 * buffer (allows the preload instruction to be moved before the load(s)).
 */
.set DST_PRELOAD_BIAS, 0
/*
 * Offset into stack where mask and source pointer/stride can be accessed.
 */
#ifdef DEBUG_PARAMS
.set ARGS_STACK_OFFSET, (9*4+9*4)
#else
.set ARGS_STACK_OFFSET, (9*4)
#endif
/*
 * Offset into stack where space allocated during init macro can be accessed.
 */
.set LOCALS_STACK_OFFSET, 0
.macro preload_leading_step1 bpp, ptr, base
/* If the destination is already 16-byte aligned, then we need to preload
 * between 0 and prefetch_distance (inclusive) cache lines ahead so there
 * are no gaps when the inner loop starts. (A C sketch of this follows the
 * macro.)
 */
.if \bpp > 0
PF bic, \ptr, \base, #31
.set OFFSET, 0
.rept prefetch_distance+1
PF pld, [\ptr, #OFFSET]
.set OFFSET, OFFSET+32
.endr
.endif
.endm
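/* The C sketch promised above (illustrative only; prefetch() is a stand-in
 * for the PLD instruction): starting from the base address rounded down to
 * a cacheline, touch (prefetch_distance+1) consecutive 32-byte cachelines.
 *
 *     static void preload_leading_step1_model(uintptr_t base,
 *                                             unsigned prefetch_distance)
 *     {
 *         uintptr_t line = base & ~(uintptr_t) 31;  // bic ptr, base, #31
 *         for (unsigned i = 0; i <= prefetch_distance; i++)
 *             prefetch(line + 32 * i);              // pld [ptr, #OFFSET]
 *     }
 */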
.macro preload_leading_step2 bpp, bpp_shift, ptr, base
/* However, if the destination is not 16-byte aligned, we may need to
 * preload more cache lines than that. The question we need to ask is:
 * are the bytes corresponding to the leading pixels more than the amount
 * by which the source pointer will be rounded down for preloading, and if
 * so, by how many cache lines? Effectively, we want to calculate
 *     leading_bytes = ((-dst)&15)*src_bpp/dst_bpp
 *     inner_loop_offset = (src+leading_bytes)&31
 *     extra_needed = leading_bytes - inner_loop_offset
 * and test if extra_needed is <= 0, <= 32, or > 32 (where > 32 is only
 * possible when there are 4 src bytes for every 1 dst byte). A worked C
 * model of this test follows the macro.
 */
.if \bpp > 0
.ifc \base,DST
/* The test can be simplified further when preloading the destination */
PF tst, \base, #16
PF beq, 61f
.else
.if \bpp/dst_w_bpp == 4
PF add, SCRATCH, \base, WK0, lsl #\bpp_shift-dst_bpp_shift
PF and, SCRATCH, SCRATCH, #31
PF rsb, SCRATCH, SCRATCH, WK0, lsl #\bpp_shift-dst_bpp_shift
PF sub, SCRATCH, SCRATCH, #1 /* so now ranges are -16..-1 / 0..31 / 32..63 */
PF movs, SCRATCH, SCRATCH, lsl #32-6 /* so this sets NC / nc / Nc */
PF bcs, 61f
PF bpl, 60f
PF pld, [\ptr, #32*(prefetch_distance+2)]
.else
PF mov, SCRATCH, \base, lsl #32-5
PF add, SCRATCH, SCRATCH, WK0, lsl #32-5+\bpp_shift-dst_bpp_shift
PF rsbs, SCRATCH, SCRATCH, WK0, lsl #32-5+\bpp_shift-dst_bpp_shift
PF bls, 61f
.endif
.endif
60: PF pld, [\ptr, #32*(prefetch_distance+1)]
61:
.endif
.endm
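/* The worked C model promised above (illustrative only; dst and src stand
 * for the destination and preloaded channel base addresses, and the return
 * value is how many cachelines beyond step1's preloads are required):
 *
 *     static int extra_cachelines_model(uintptr_t dst, uintptr_t src,
 *                                       unsigned src_bpp, unsigned dst_bpp)
 *     {
 *         uintptr_t leading_bytes = ((-dst) & 15) * src_bpp / dst_bpp;
 *         uintptr_t inner_loop_offset = (src + leading_bytes) & 31;
 *         intptr_t  extra_needed = leading_bytes - inner_loop_offset;
 *         if (extra_needed <= 0)  return 0;  // step1 already covered it
 *         if (extra_needed <= 32) return 1;  // one extra cacheline
 *         return 2;  // only when there are 4 src bytes per dst byte
 *     }
 */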
.macro preload_trailing bpp, bpp_shift, base
.if \bpp > 0
.if \bpp*pix_per_block > 256
/* Calculations are more complex if more than one fetch per block */
PF and, WK1, \base, #31
PF add, WK1, WK1, WK0, lsl #\bpp_shift
PF add, WK1, WK1, #32*(\bpp*pix_per_block/256-1)*(prefetch_distance+1)
PF bic, SCRATCH, \base, #31
80: PF pld, [SCRATCH, #32*(prefetch_distance+1)]
PF add, SCRATCH, SCRATCH, #32
PF subs, WK1, WK1, #32
PF bhi, 80b
.else
/* If exactly one fetch per block, then we need either 0, 1 or 2 extra preloads */
PF mov, SCRATCH, \base, lsl #32-5
PF adds, SCRATCH, SCRATCH, X, lsl #32-5+\bpp_shift
PF adcseq, SCRATCH, SCRATCH, #0
/* The instruction above has two effects: it ensures Z is only set if C
 * was clear (so Z indicates that both shifted quantities were 0), and it
 * clears C if Z was set (so C indicates that the sum of the shifted
 * quantities was greater than, and not equal to, 32). A C model of the
 * resulting 0/1/2 decision follows the macro. */
PF beq, 82f
PF bic, SCRATCH, \base, #31
PF bcc, 81f
PF pld, [SCRATCH, #32*(prefetch_distance+2)]
81: PF pld, [SCRATCH, #32*(prefetch_distance+1)]
82:
.endif
.endif
.endm
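/* The C model promised above, for the single-fetch-per-block branch
 * (a sketch, not authoritative; x_pixels corresponds to X):
 *
 *     static int trailing_extra_preloads_model(uintptr_t base,
 *                                              unsigned x_pixels,
 *                                              unsigned bpp_shift)
 *     {
 *         unsigned off   = base & 31;                    // offset in cacheline
 *         unsigned bytes = (x_pixels << bpp_shift) & 31; // sub-line remainder
 *         if (off + bytes == 0)  return 0; // row ends on a line boundary
 *         if (off + bytes > 32)  return 2; // remainder spills into a 2nd line
 *         return 1;
 *     }
 */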
.macro preload_line narrow_case, bpp, bpp_shift, base
/* "narrow_case" - just means that the macro was invoked from the "narrow"
 *    code path rather than the "medium" one - because in the narrow case,
 *    the row of pixels is known to output no more than 30 bytes, so
 *    (assuming the source pixels are no wider than the destination
 *    pixels) they cannot possibly straddle more than 2 32-byte cachelines,
 *    meaning there's no need for a loop
 * "bpp" - number of bits per pixel in the channel (source, mask or
 *    destination) that's being preloaded, or 0 if this channel is not used
 *    for reading
 * "bpp_shift" - log2 of ("bpp"/8) (except if "bpp"=0 of course)
 * "base" - base address register of channel to preload (SRC, MASK or DST)
 * A C sketch of the general branch follows the macro.
 */
.if \bpp > 0
.if \narrow_case && (\bpp <= dst_w_bpp)
/* In these cases, each line for each channel is in either 1 or 2 cache lines */
PF bic, WK0, \base, #31
PF pld, [WK0]
PF add, WK1, \base, X, lsl #\bpp_shift
PF sub, WK1, WK1, #1
PF bic, WK1, WK1, #31
PF cmp, WK1, WK0
PF beq, 90f
PF pld, [WK1]
90:
.else
PF bic, WK0, \base, #31
PF pld, [WK0]
PF add, WK1, \base, X, lsl #\bpp_shift
PF sub, WK1, WK1, #1
PF bic, WK1, WK1, #31
PF cmp, WK1, WK0
PF beq, 92f
91: PF add, WK0, WK0, #32
PF cmp, WK0, WK1
PF pld, [WK0]
PF bne, 91b
92:
.endif
.endif
.endm
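/* The C sketch promised above: the general (looping) branch of preload_line
 * touches every cacheline from the first to the last byte of the row
 * (prefetch() again stands in for PLD):
 *
 *     static void preload_line_model(uintptr_t base, unsigned x_pixels,
 *                                    unsigned bpp_shift)
 *     {
 *         uintptr_t first = base & ~(uintptr_t) 31;
 *         uintptr_t last  = (base + ((uintptr_t) x_pixels << bpp_shift) - 1)
 *                               & ~(uintptr_t) 31;
 *         for (uintptr_t p = first; p <= last; p += 32)
 *             prefetch(p);
 *     }
 *
 * The narrow-case branch is the same calculation with the loop unrolled to
 * at most two prefetches.
 */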
.macro wide_case_inner_loop process_head, process_tail, unaligned_src, unaligned_mask, dst_alignment
110:
.set SUBBLOCK, 0 /* this is a count of STMs; there can be up to 8 STMs per block */
.rept pix_per_block*dst_w_bpp/128
\process_head , 16, 0, \unaligned_src, \unaligned_mask, 1
.if (src_bpp > 0) && (mask_bpp == 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
preload_middle src_bpp, SRC, 1
.elseif (src_bpp == 0) && (mask_bpp > 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
preload_middle mask_bpp, MASK, 1
.else
preload_middle src_bpp, SRC, 0
preload_middle mask_bpp, MASK, 0
.endif
.if (dst_r_bpp > 0) && ((SUBBLOCK % 2) == 0) && (((flags) & FLAG_NO_PRELOAD_DST) == 0)
/* Because we know that writes are 16-byte aligned, it's relatively easy to ensure that
 * destination prefetches are 32-byte aligned. It's also the easiest channel to offset
 * preloads for, to achieve staggered prefetches for multiple channels, because there are
 * always two STMs per prefetch, so there is always an opposite STM on which to put the
 * preload. Note, no need to BIC the base register here. */
PF pld, [DST, #32*prefetch_distance - \dst_alignment]
.endif
\process_tail , 16, 0
.if !((flags) & FLAG_PROCESS_DOES_STORE)
pixst , 16, 0, DST
.endif
.set SUBBLOCK, SUBBLOCK+1
.endr
subs X, X, #pix_per_block
bhs 110b
.endm
.macro wide_case_inner_loop_and_trailing_pixels process_head, process_tail, process_inner_loop, exit_label, unaligned_src, unaligned_mask
/* Destination now 16-byte aligned; we have at least one block before we have to stop preloading */
.if dst_r_bpp > 0
tst DST, #16
bne 111f
\process_inner_loop \process_head, \process_tail, \unaligned_src, \unaligned_mask, 16 + DST_PRELOAD_BIAS
b 112f
111:
.endif
\process_inner_loop \process_head, \process_tail, \unaligned_src, \unaligned_mask, 0 + DST_PRELOAD_BIAS
112:
/* Just before the final (prefetch_distance+1) 32-byte blocks, deal with final preloads */
.if (src_bpp*pix_per_block > 256) || (mask_bpp*pix_per_block > 256) || (dst_r_bpp*pix_per_block > 256)
PF and, WK0, X, #pix_per_block-1
.endif
preload_trailing src_bpp, src_bpp_shift, SRC
preload_trailing mask_bpp, mask_bpp_shift, MASK
.if ((flags) & FLAG_NO_PRELOAD_DST) == 0
preload_trailing dst_r_bpp, dst_bpp_shift, DST
.endif
add X, X, #(prefetch_distance+2)*pix_per_block - 128/dst_w_bpp
/* The remainder of the line is handled identically to the medium case
 * (see the X bookkeeping sketch after this macro) */
medium_case_inner_loop_and_trailing_pixels \process_head, \process_tail,, \exit_label, \unaligned_src, \unaligned_mask
.endm
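/* The X bookkeeping sketch referred to above (illustrative only): in C-like
 * terms, the wide path's pixel counting works out as
 *
 *     x -= (prefetch_distance + 2) * pix_per_block;  // wide-case entry
 *     do {
 *         process_block();                           // one 32-byte block
 *         x -= pix_per_block;
 *     } while ((int) x >= 0);                        // the subs/bhs pair
 *     x += (prefetch_distance + 2) * pix_per_block   // restore remainder...
 *          - 128 / dst_w_bpp;                        // ...in medium-case form
 *
 * where the final subtraction matches the "sub X, X, #128/dst_w_bpp" that
 * the medium path performs to simplify its own loop termination.
 */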
/* The standard entry conditions set up by pixman-arm-common.h are:
 * r0 = width (pixels)
 * r1 = height (rows)
 * r2 = pointer to top-left pixel of destination
 * r3 = destination stride (pixels)
 * [sp] = source pixel value, or pointer to top-left pixel of source
 * [sp,#4] = 0 or source stride (pixels)
 * The following arguments are unused for non-mask operations:
 * [sp,#8] = mask pixel value, or pointer to top-left pixel of mask
 * [sp,#12] = 0 or mask stride (pixels)
 */
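/* Viewed from C, a generated function therefore behaves roughly like this
 * hypothetical prototype (names invented for illustration; the src/mask
 * words are either immediate pixel values or pointers, as noted above):
 *
 *     void composite_fast_path(int32_t width, int32_t height,
 *                              uint32_t *dst, int32_t dst_stride,
 *                              uint32_t src_or_src_ptr, int32_t src_stride,
 *                              uint32_t mask_or_mask_ptr, int32_t mask_stride);
 */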
/*
 * Assign symbolic names to registers
 */
X .req r0 /* pixels to go on this line */
Y .req r1 /* lines to go */
DST .req r2 /* destination pixel pointer */
STRIDE_D .req r3 /* destination stride (bytes, minus width) */
SRC .req r4 /* source pixel pointer */
STRIDE_S .req r5 /* source stride (bytes, minus width) */
MASK .req r6 /* mask pixel pointer (if applicable) */
STRIDE_M .req r7 /* mask stride (bytes, minus width) */
WK0 .req r8 /* pixel data registers */
WK1 .req r9
WK2 .req r10
WK3 .req r11
SCRATCH .req r12
ORIG_W .req r14 /* width (pixels) */
#ifdef DEBUG_PARAMS
add Y, Y, #1
stmia sp, {r0-r7,pc}
sub Y, Y, #1
#endif
\init
.if (flags) & FLAG_PROCESS_CORRUPTS_WK0
/* Reserve a word in which to store X during leading pixels */
sub sp, sp, #4
.set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET+4
.set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET+4
.endif
/* Are we not even wide enough to have one 16-byte aligned 16-byte block write? */
cmp X, #2*16*8/dst_w_bpp - 1
blo 170f
.if src_bpp || mask_bpp || dst_r_bpp /* Wide and medium cases are the same for fill */
/* To preload ahead on the current line, we need at least
 * (prefetch_distance+2) 32-byte blocks on all prefetch channels */
cmp X, #(prefetch_distance+3)*pix_per_block - 1
blo 160f
/* Wide case */
/* Adjust X so that the decrement instruction can also test for
 * inner loop termination. We want it to stop when there are
 * (prefetch_distance+1) complete blocks to go. */
sub X, X, #(prefetch_distance+2)*pix_per_block
mov ORIG_W, X
.if (flags) & FLAG_SPILL_LINE_VARS_WIDE
/* This is stmdb sp!,{} (see the worked encoding example below) */
.word 0xE92D0000 | LINE_SAVED_REGS
.set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4
.set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4
.endif
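/* Worked example of the encoding trick above: 0xE92D0000 is STMDB sp!,{}
 * with an empty register list, and bit N of the low halfword selects rN,
 * so if LINE_SAVED_REGS were, say, r4 and r5:
 *
 *     0xE92D0000 | (1 << 4) | (1 << 5) == 0xE92D0030  =>  stmdb sp!, {r4,r5}
 */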
151: /* New line */
\newline
preload_leading_step1 src_bpp, WK1, SRC
preload_leading_step1 mask_bpp, WK2, MASK
.if ((flags) & FLAG_NO_PRELOAD_DST) == 0
preload_leading_step1 dst_r_bpp, WK3, DST
.endif
ands WK0, DST, #15
beq 154f
rsb WK0, WK0, #16 /* number of leading bytes until destination aligned */
preload_leading_step2 src_bpp, src_bpp_shift, WK1, SRC
preload_leading_step2 mask_bpp, mask_bpp_shift, WK2, MASK
.if ((flags) & FLAG_NO_PRELOAD_DST) == 0
preload_leading_step2 dst_r_bpp, dst_bpp_shift, WK3, DST
.endif
leading_15bytes \process_head, \process_tail
154: /* Destination now 16-byte aligned; we have at least one prefetch on each channel as well as at least one 16-byte output block */
.if (src_bpp > 0) && (mask_bpp == 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
and SCRATCH, SRC, #31
rsb SCRATCH, SCRATCH, #32*prefetch_distance
.elseif (src_bpp == 0) && (mask_bpp > 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
and SCRATCH, MASK, #31
rsb SCRATCH, SCRATCH, #32*prefetch_distance
.endif
.ifc "\process_inner_loop",""
switch_on_alignment wide_case_inner_loop_and_trailing_pixels, \process_head, \process_tail, wide_case_inner_loop, 157f
.else
switch_on_alignment wide_case_inner_loop_and_trailing_pixels, \process_head, \process_tail, \process_inner_loop, 157f
.endif
157: /* Check for another line */
end_of_line 1, %((flags) & FLAG_SPILL_LINE_VARS_WIDE), 151b
.if (flags) & FLAG_SPILL_LINE_VARS_WIDE
.set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET - LINE_SAVED_REG_COUNT*4
.set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET - LINE_SAVED_REG_COUNT*4
.endif
.endif
.ltorg
160: /* Medium case */
mov ORIG_W, X
.if (flags) & FLAG_SPILL_LINE_VARS_NON_WIDE
/* This is stmdb sp!,{} */
.word 0xE92D0000 | LINE_SAVED_REGS
.set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4
.set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4
.endif
161: /* New line */
\newline
preload_line 0, src_bpp, src_bpp_shift, SRC /* in: X, corrupts: WK0-WK1 */
preload_line 0, mask_bpp, mask_bpp_shift, MASK
.if ((flags) & FLAG_NO_PRELOAD_DST) == 0
preload_line 0, dst_r_bpp, dst_bpp_shift, DST
.endif
sub X, X, #128/dst_w_bpp /* simplifies inner loop termination */
ands WK0, DST, #15
beq 164f
rsb WK0, WK0, #16 /* number of leading bytes until destination aligned */
leading_15bytes \process_head, \process_tail
164: /* Destination now 16-byte aligned; we have at least one 16-byte output block */
switch_on_alignment medium_case_inner_loop_and_trailing_pixels, \process_head, \process_tail,, 167f
167: /* Check for another line */
end_of_line 1, %((flags) & FLAG_SPILL_LINE_VARS_NON_WIDE), 161b
.ltorg
170: /* Narrow case, less than 31 bytes, so no guarantee of at least one 16-byte block */
.if dst_w_bpp < 32
mov ORIG_W, X
.endif
.if (flags) & FLAG_SPILL_LINE_VARS_NON_WIDE
/* This is stmdb sp!,{} */
.word 0xE92D0000 | LINE_SAVED_REGS
.endif
171: /* New line */
\newline
preload_line 1, src_bpp, src_bpp_shift, SRC /* in: X, corrupts: WK0-WK1 */
preload_line 1, mask_bpp, mask_bpp_shift, MASK
.if ((flags) & FLAG_NO_PRELOAD_DST) == 0
preload_line 1, dst_r_bpp, dst_bpp_shift, DST
.endif
174: /* Destination now 4-byte aligned; we have 0 or more output bytes to go */
switch_on_alignment narrow_case_inner_loop_and_trailing_pixels, \process_head, \process_tail,, 177f
177: /* Check for another line */
end_of_line %(dst_w_bpp < 32), %((flags) & FLAG_SPILL_LINE_VARS_NON_WIDE), 171b, last_one