/*
 * This file contains a macro ('generate_composite_function') which can
 * construct 2D image processing functions, based on a common template.
 * Any combination of source, destination and mask images with 8bpp,
 * 16bpp, 24bpp, 32bpp color formats is supported.
 *
 * This macro takes care of:
 *  - handling of leading and trailing unaligned pixels
 *  - doing most of the work related to L2 cache preload
 *  - encouraging the use of software pipelining for better instruction
 *    scheduling
 *
 * The user of this macro has to provide some configuration parameters
 * (bit depths for the images, prefetch distance, etc.) and a set of
 * macros, which should implement basic code chunks responsible for
 * pixel processing. See the 'pixman-arm-neon-asm.S' file for usage
 * examples.
 *
 * TODO:
 *  - try overlapped pixel method (from Ian Rickards) when processing
 *    exactly two blocks of pixels
 *  - maybe add an option to do reverse scanline processing
 */
/*
 * Bit flags for the 'generate_composite_function' macro, used to tune the
 * behavior of the generated functions. Each non-zero flag occupies its
 * own bit.
 */
.set FLAG_DST_WRITEONLY, 0                  /* no flag bits set */
.set FLAG_DST_READWRITE, (1 << 0)           /* bit 0 */
.set FLAG_DEINTERLEAVE_32BPP, (1 << 1)      /* bit 1 */
/*
 * Offset in stack where mask and source pointer/stride can be accessed
 * from the 'init' macro. This is useful for doing special handling for
 * a solid mask.
 *
 * The value matches the ten 32-bit registers (r4-r12, lr) pushed by the
 * prologue of 'generate_composite_function'.
 */
.set ARGS_STACK_OFFSET, (10 * 4)
/*
 * Constants for selecting the preferred prefetch type. Their numeric
 * order is significant: the values are compared with '<' and '>' when
 * 'generate_composite_function' picks the prefetch type to use.
 */
.set PREFETCH_TYPE_NONE, 0                                /* No prefetch at all */
.set PREFETCH_TYPE_SIMPLE, (PREFETCH_TYPE_NONE + 1)       /* A simple, fixed-distance-ahead prefetch */
.set PREFETCH_TYPE_ADVANCED, (PREFETCH_TYPE_SIMPLE + 1)   /* Advanced fine-grained prefetch */
/* * Definitions of supplementary pixld/pixst macros (for partial load/store of * pixel data).
*/
/* * This is a macro for implementing cache preload. The main idea is that * cache preload logic is mostly independent from the rest of pixels * processing code. It starts at the top left pixel and moves forward * across pixels and can jump across scanlines. Prefetch distance is * handled in an 'incremental' way: it starts from 0 and advances to the * optimal distance over time. After reaching optimal prefetch distance, * it is kept constant. There are some checks which prevent prefetching * unneeded pixel lines below the image (but it still can prefetch a bit * more data on the right side of the image - not a big issue and may * be actually helpful when rendering text glyphs). Additional trick is * the use of LDR instruction for prefetch instead of PLD when moving to * the next line, the point is that we have a high chance of getting TLB * miss in this case, and PLD would be useless. * * This sounds like it may introduce a noticeable overhead (when working with * fully cached data). But in reality, due to having a separate pipeline and * instruction queue for NEON unit in ARM Cortex-A8, normal ARM code can * execute simultaneously with NEON and be completely shadowed by it. Thus * we get no performance overhead at all (*). This looks like a very nice * feature of Cortex-A8, if used wisely. We don't have a hardware prefetcher, * but still can implement some rather advanced prefetch logic in software * for almost zero cost! * * (*) The overhead of the prefetcher is visible when running some trivial * pixels processing like simple copy. Anyway, having prefetch is a must * when working with the graphics data.
*/
/*
 * Conditional emitter for prefetch bookkeeping code: assembles the
 * instruction 'a' with operands 'x' only when the advanced prefetch type
 * is currently selected, and expands to nothing otherwise.
 */
.macro PF a, x:vararg
/* The difference is zero exactly when the current type is ADVANCED */
.ifeq (PREFETCH_TYPE_CURRENT - PREFETCH_TYPE_ADVANCED)
    \a \x
.endif
.endm
/*
 * Macro which is used to process leading pixels until the destination
 * pointer is properly aligned (at a 16 byte boundary). For a 24bpp
 * destination this is not attempted at all (see the 'dst_w_bpp != 24'
 * guard below).
 *
 * The three parameters name the pixel processing code chunks
 * (head, tail and combined tail_head).
 */
.macro ensure_destination_ptr_alignment process_pixblock_head, \
process_pixblock_tail, \
process_pixblock_tail_head
.if dst_w_bpp != 24
/* Low four bits of DST_R are clear: already aligned, branch forward to local label 2 */
tst DST_R, #0xF
beq 2f
/*
 * Special code for processing up to (pixblock_size - 1) remaining
 * trailing pixels. As SIMD processing performs operation on
 * pixblock_size pixels, anything smaller than this has to be loaded
 * and stored in a special way. Loading and storing of pixel data is
 * performed in such a way that we fill some 'slots' in the NEON
 * registers (some slots naturally are unused), then perform compositing
 * operation as usual. In the end, the data is taken from these 'slots'
 * and saved to memory.
 *
 * cache_preload_flag - when set to 0, the prefetch position (PF_X)
 *                      update is not emitted
 * dst_aligned_flag   - selects whether destination buffer
 *                      is aligned (pixld_a is used for the destination
 *                      load when non-zero, pixld otherwise)
 * process_pixblock_head, process_pixblock_tail,
 * process_pixblock_tail_head
 *                    - pixel processing code chunks
 */
.macro process_trailing_pixels cache_preload_flag, \
dst_aligned_flag, \
process_pixblock_head, \
process_pixblock_tail, \
process_pixblock_tail_head
/* No W bits below pixblock_size are set: nothing is left over, branch forward to local label 2 */
/* NOTE(review): the mask assumes pixblock_size is a power of two - confirm */
tst W, #(pixblock_size - 1)
beq 2f
/* One load step per candidate chunk size; only sizes smaller than pixblock_size are assembled */
.irp chunk_size, 16, 8, 4, 2, 1
.if pixblock_size > \chunk_size
/* Skip this chunk when the matching bit of W is clear */
tst W, #\chunk_size
beq 1f
/* Load this chunk of source, mask and destination pixels */
pixld_src \chunk_size, src_bpp, src_basereg, SRC
pixld \chunk_size, mask_bpp, mask_basereg, MASK
.if \dst_aligned_flag != 0
pixld_a \chunk_size, dst_r_bpp, dst_r_basereg, DST_R
.else
pixld \chunk_size, dst_r_bpp, dst_r_basereg, DST_R
.endif
.if \cache_preload_flag != 0
/* Advance the prefetch pixel index (only assembled for the advanced prefetch type) */
PF add, PF_X, PF_X, #\chunk_size
.endif
1:
.endif
.endr
/* Deinterleave whatever has been loaded for source, mask and destination */
pixdeinterleave src_bpp, src_basereg
pixdeinterleave mask_bpp, mask_basereg
pixdeinterleave dst_r_bpp, dst_r_basereg
/*
 * Macro, which performs all the needed operations to switch to the next
 * scanline and start the next loop iteration unless all the scanlines
 * are already processed.
 *
 * start_of_loop_label - label to branch back to while scanlines remain
 *
 * Steps, in order:
 *  1. reload the scanline width into W (from the stack together with H
 *     when 'regs_shortage' is set, from ORIG_W otherwise)
 *  2. add the (scaled) stride to the destination, source and mask
 *     pointers; source and mask are skipped when their bpp is 0
 *  3. subtract the (scaled) width from each pointer, except for 24bpp
 *     images, where 'generate_composite_function' has already
 *     subtracted 3 * W from the stride
 *  4. decrement H and branch back while the result is not negative
 */
.macro advance_to_next_scanline start_of_loop_label
.if regs_shortage
ldrd W, [sp] /* load W and H (width and height) from stack */
.else
mov W, ORIG_W
.endif
/* Step every used pointer forward by one stride */
add DST_W, DST_W, DST_STRIDE, lsl #dst_bpp_shift
.if src_bpp != 0
add SRC, SRC, SRC_STRIDE, lsl #src_bpp_shift
.endif
.if mask_bpp != 0
add MASK, MASK, MASK_STRIDE, lsl #mask_bpp_shift
.endif
/* Step back by one scanline width (not emitted for 24bpp images) */
.if (dst_w_bpp != 24)
sub DST_W, DST_W, W, lsl #dst_bpp_shift
.endif
.if (src_bpp != 24) && (src_bpp != 0)
sub SRC, SRC, W, lsl #src_bpp_shift
.endif
.if (mask_bpp != 24) && (mask_bpp != 0)
sub MASK, MASK, W, lsl #mask_bpp_shift
.endif
/*
 * 'subs' sets the condition flags consumed by 'bge' below; the 'mov' and
 * 'str' in between do not modify the flags.
 */
subs H, H, #1
/* The destination read pointer restarts at the destination write pointer */
mov DST_R, DST_W
.if regs_shortage
str H, [sp, #4] /* save updated height to stack */
.endif
bge \start_of_loop_label
.endm
/*
 * Registers are allocated in the following way by default:
 * d0, d1, d2, d3 - reserved for loading source pixel data
 * d4, d5, d6, d7 - reserved for loading destination pixel data
 * d24, d25, d26, d27 - reserved for loading mask pixel data
 * d28, d29, d30, d31 - final destination pixel data for writeback to memory
 *
 * Parameters:
 * fname             - function name, handed to 'pixman_asm_function'
 * src_bpp_,
 * mask_bpp_,
 * dst_w_bpp_        - bit depths of the source, mask and destination
 *                     images; a source or mask depth of 0 is treated as
 *                     "image not used"
 * flags             - FLAG_* bit flags
 * pixblock_size_    - number of pixels handled per SIMD block
 * prefetch_distance - prefetch distance; 0 disables prefetch
 * init, cleanup     - NOTE(review): presumably the prologue/epilogue
 *                     macros (cf. default_init/default_cleanup) - confirm
 * process_pixblock_head,
 * process_pixblock_tail,
 * process_pixblock_tail_head
 *                   - macros implementing the pixel processing chunks
 * dst_w_basereg_,
 * dst_r_basereg_,
 * src_basereg_,
 * mask_basereg_     - number of the first NEON 'd' register of each
 *                     group; the defaults match the table above
 */
.macro generate_composite_function fname, \
src_bpp_, \
mask_bpp_, \
dst_w_bpp_, \
flags, \
pixblock_size_, \
prefetch_distance, \
init, \
cleanup, \
process_pixblock_head, \
process_pixblock_tail, \
process_pixblock_tail_head, \
dst_w_basereg_ = 28, \
dst_r_basereg_ = 4, \
src_basereg_ = 0, \
mask_basereg_ = 24
pixman_asm_function \fname
push {r4-r12, lr} /* save r4-r12 and lr: ten registers, 40 bytes (cf. ARGS_STACK_OFFSET) */
/*
 * Select prefetch type for this function. It starts from
 * PREFETCH_TYPE_DEFAULT. If prefetch distance is set to 0, prefetch is
 * disabled completely (NONE). Otherwise, if one of the color formats is
 * 24bpp, SIMPLE prefetch has to be used instead of ADVANCED.
 */
.set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_DEFAULT
.if \prefetch_distance == 0
.set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE
.elseif (PREFETCH_TYPE_CURRENT > PREFETCH_TYPE_SIMPLE) && \
((\src_bpp_ == 24) || (\mask_bpp_ == 24) || (\dst_w_bpp_ == 24))
.set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_SIMPLE
.endif
/*
 * Make some macro arguments globally visible and accessible
 * from other macros
 */
.set src_bpp, \src_bpp_
.set mask_bpp, \mask_bpp_
.set dst_w_bpp, \dst_w_bpp_
.set pixblock_size, \pixblock_size_
.set dst_w_basereg, \dst_w_basereg_
.set dst_r_basereg, \dst_r_basereg_
.set src_basereg, \src_basereg_
.set mask_basereg, \mask_basereg_
PF_CTL .req r9 /* combined lines counter and prefetch distance increment counter */
PF_X .req r10 /* pixel index in a scanline for current prefetch position */
PF_SRC .req r11 /* pointer to source scanline start for prefetch purposes */
PF_DST .req r12 /* pointer to destination scanline start for prefetch purposes */
PF_MASK .req r14 /* pointer to mask scanline start for prefetch purposes */
/*
 * Check whether we have enough registers for all the local variables.
 * If we don't have enough registers, original width and height are
 * kept on top of stack (and 'regs_shortage' variable is set to indicate
 * this for the rest of code). Even if there are enough registers, the
 * allocation scheme may be a bit different depending on whether source
 * or mask is not used.
 */
.if (PREFETCH_TYPE_CURRENT < PREFETCH_TYPE_ADVANCED)
/* No advanced prefetch: r10 and r12 are shared with the PF_X and PF_DST aliases above */
ORIG_W .req r10 /* saved original width */
DUMMY .req r12 /* temporary register */
.set regs_shortage, 0
.elseif mask_bpp == 0
ORIG_W .req r7 /* saved original width */
DUMMY .req r8 /* temporary register */
.set regs_shortage, 0
.elseif src_bpp == 0
ORIG_W .req r4 /* saved original width */
DUMMY .req r5 /* temporary register */
.set regs_shortage, 0
.else
/*
 * Out of registers: ORIG_W and DUMMY both alias r1. With 'regs_shortage'
 * set, 'advance_to_next_scanline' reloads the width from the stack
 * instead of reading ORIG_W.
 */
ORIG_W .req r1 /* saved original width */
DUMMY .req r1 /* temporary register */
.set regs_shortage, 1
.endif
/*
 * For 24bpp images, subtract 3 * W (W + 2 * W) from the stride up front.
 * 'advance_to_next_scanline' correspondingly skips its own
 * width subtraction for 24bpp images.
 */
.if src_bpp == 24
sub SRC_STRIDE, SRC_STRIDE, W
sub SRC_STRIDE, SRC_STRIDE, W, lsl #1
.endif
.if mask_bpp == 24
sub MASK_STRIDE, MASK_STRIDE, W
sub MASK_STRIDE, MASK_STRIDE, W, lsl #1
.endif
.if dst_w_bpp == 24
sub DST_STRIDE, DST_STRIDE, W
sub DST_STRIDE, DST_STRIDE, W, lsl #1
.endif
/*
 * Default prologue/epilogue, nothing special needs to be done: both
 * macros take no arguments and expand to nothing.
 */
.macro default_init
.endm
.macro default_cleanup
.endm
/* * Prologue/epilogue variant which additionally saves/restores d8-d15 * registers (they need to be saved/restored by callee according to ABI). * This is required if the code needs to use all the NEON registers.
*/
/*
 * Conversion from planar 8-bit color components (r, g, b in 64-bit
 * registers in_r, in_g, in_b respectively; there is no alpha input)
 * into 8 r5g6b5 pixels packed in a 128-bit register (out). Requires two
 * temporary 128-bit registers (tmp1, tmp2).
 *
 * Each component is first widened to 16 bits with its 8 bits in the top
 * byte. Two shift-right-and-insert steps then place green below the top
 * 5 bits (red) and blue below the top 11 bits (red + green) of 'out'.
 */
.macro convert_8888_to_0565 in_r, in_g, in_b, out, tmp1, tmp2
/* Widen: component ends up in bits 15..8 of every 16-bit lane */
vshll.u8 \tmp1, \in_g, #8
vshll.u8 \out, \in_r, #8
vshll.u8 \tmp2, \in_b, #8
/* Keep the top 5 bits of red, insert green right below them */
vsri.u16 \out, \tmp1, #5
/* Keep red and green (top 11 bits), insert blue into the low 5 bits */
vsri.u16 \out, \tmp2, #11
.endm
/*
 * Conversion of four r5g6b5 pixels (in) to four x8r8g8b8 pixels
 * returned in (out0, out1) registers pair. Requires one temporary
 * 64-bit register (tmp). 'out1' and 'in' may overlap, the original
 * value from 'in' is lost.
 *
 * Each component is moved to the top of its 16-bit lane and then
 * widened to 8 bits by inserting a right-shifted copy of itself below
 * (so the low bits replicate the component's high bits). The final
 * 'vzip' interleaves the green/blue halfwords (out0) with the red
 * halfwords (out1, upper byte zero) into 32-bit pixels.
 */
.macro convert_four_0565_to_x888_packed in, out0, out1, tmp
vshl.u16 \out0, \in, #5/* G top 6 bits */
vshl.u16 \tmp, \in, #11 /* B top 5 bits */
vsri.u16 \in, \in, #5/* R is ready in top bits */
vsri.u16 \out0, \out0, #6/* G is ready in top bits */
vsri.u16 \tmp, \tmp, #5/* B is ready in top bits */
vshr.u16 \out1, \in, #8/* R is in place */
vsri.u16 \out0, \tmp, #8/* G & B is in place */
vzip.u16 \out0, \out1 /* everything is in place */
.endm
Messung V0.5
¤ Dauer der Verarbeitung: 0.42 Sekunden
(vorverarbeitet)
¤
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.