/* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright (c) 2010-2011, The Linux Foundation. All rights reserved.
*/
/* * Description * * library function for memcpy where length bytes are copied from * ptr_in to ptr_out. ptr_out is returned unchanged. * Allows any combination of alignment on input and output pointers * and length from 0 to 2^32-1 * * Restrictions * The arrays should not overlap, the program will produce undefined output * if they do. * For blocks less than 16 bytes a byte by byte copy is performed. For * 8byte alignments, and length multiples, a dword copy is performed up to * 96bytes * History * * DJH 5/15/09 Initial version 1.0 * DJH 6/ 1/09 Version 1.1 modified ABI to include R16-R19 * DJH 7/12/09 Version 1.2 optimized codesize down to 760 was 840 * DJH 10/14/09 Version 1.3 added special loop for aligned case, was * overreading bloated codesize back up to 892 * DJH 4/20/10 Version 1.4 fixed Ldword_loop_epilog loop to prevent loads * occurring if only 1 left outstanding, fixes bug * # 3888, corrected for all alignments. Peeled off * 1 32byte chunk from kernel loop and extended 8byte * loop at end to solve all combinations and prevent * over read. Fixed Ldword_loop_prolog to prevent * overread for blocks less than 48bytes. Reduced * codesize to 752 bytes * DJH 4/21/10 version 1.5 1.4 fix broke code for input block ends not * aligned to dword boundaries, underwriting by 1 * byte, added detection for this and fixed. A * little bloat. 
* DJH 4/23/10 version 1.6 corrected stack error, R20 was not being restored * always, fixed the error of R20 being modified * before it was being saved * Natural c model * =============== * void * memcpy(char * ptr_out, char * ptr_in, int length) { * int i; * if(length) for(i=0; i < length; i++) { ptr_out[i] = ptr_in[i]; } * return(ptr_out); * } * * Optimized memcpy function * ========================= * void * memcpy(char * ptr_out, char * ptr_in, int len) { * int i, prolog, kernel, epilog, mask; * u8 offset; * s64 data0, dataF8, data70; * * s64 * ptr8_in; * s64 * ptr8_out; * s32 * ptr4; * s16 * ptr2; * * offset = ((int) ptr_in) & 7; * ptr8_in = (s64 *) &ptr_in[-offset]; //read in the aligned pointers * * data70 = *ptr8_in++; * dataF8 = *ptr8_in++; * * data0 = HEXAGON_P_valignb_PPp(dataF8, data70, offset); * * prolog = 32 - ((int) ptr_out); * mask = 0x7fffffff >> HEXAGON_R_cl0_R(len); * prolog = prolog & mask; * kernel = len - prolog; * epilog = kernel & 0x1F; * kernel = kernel>>5; * * if (prolog & 1) { ptr_out[0] = (u8) data0; data0 >>= 8; ptr_out += 1;} * ptr2 = (s16 *) &ptr_out[0]; * if (prolog & 2) { ptr2[0] = (u16) data0; data0 >>= 16; ptr_out += 2;} * ptr4 = (s32 *) &ptr_out[0]; * if (prolog & 4) { ptr4[0] = (u32) data0; data0 >>= 32; ptr_out += 4;} * * offset = offset + (prolog & 7); * if (offset >= 8) { * data70 = dataF8; * dataF8 = *ptr8_in++; * } * offset = offset & 0x7; * * prolog = prolog >> 3; * if (prolog) for (i=0; i < prolog; i++) { * data0 = HEXAGON_P_valignb_PPp(dataF8, data70, offset); * ptr8_out = (s64 *) &ptr_out[0]; *ptr8_out = data0; ptr_out += 8; * data70 = dataF8; * dataF8 = *ptr8_in++; * } * if(kernel) { kernel -= 1; epilog += 32; } * if(kernel) for(i=0; i < kernel; i++) { * data0 = HEXAGON_P_valignb_PPp(dataF8, data70, offset); * ptr8_out = (s64 *) &ptr_out[0]; *ptr8_out = data0; ptr_out += 8; * data70 = *ptr8_in++; * * data0 = HEXAGON_P_valignb_PPp(data70, dataF8, offset); * ptr8_out = (s64 *) &ptr_out[0]; *ptr8_out = data0; ptr_out += 
8; * dataF8 = *ptr8_in++; * * data0 = HEXAGON_P_valignb_PPp(dataF8, data70, offset); * ptr8_out = (s64 *) &ptr_out[0]; *ptr8_out = data0; ptr_out += 8; * data70 = *ptr8_in++; * * data0 = HEXAGON_P_valignb_PPp(data70, dataF8, offset); * ptr8_out = (s64 *) &ptr_out[0]; *ptr8_out = data0; ptr_out += 8; * dataF8 = *ptr8_in++; * } * epilogdws = epilog >> 3; * if (epilogdws) for (i=0; i < epilogdws; i++) { * data0 = HEXAGON_P_valignb_PPp(dataF8, data70, offset); * ptr8_out = (s64 *) &ptr_out[0]; *ptr8_out = data0; ptr_out += 8; * data70 = dataF8; * dataF8 = *ptr8_in++; * } * data0 = HEXAGON_P_valignb_PPp(dataF8, data70, offset); * * ptr4 = (s32 *) &ptr_out[0]; * if (epilog & 4) { ptr4[0] = (u32) data0; data0 >>= 32; ptr_out += 4;} * ptr2 = (s16 *) &ptr_out[0]; * if (epilog & 2) { ptr2[0] = (u16) data0; data0 >>= 16; ptr_out += 2;} * if (epilog & 1) { *ptr_out++ = (u8) data0; } * * return(ptr_out - length); * } * * Codesize : 784 bytes
*/
#define ptr_out R0 /* destination pounter */
#define ptr_in R1 /* source pointer */
#define len R2 /* length of copy in bytes */
#define data70 R13:12 /* lo 8 bytes of non-aligned transfer */
#define dataF8 R11:10 /* hi 8 bytes of non-aligned transfer */
#define ldata0 R7:6 /* even 8 bytes chunks */
#define ldata1 R25:24 /* odd 8 bytes chunks */
#define data1 R7 /* lower 8 bytes of ldata1 */
#define data0 R6 /* lower 8 bytes of ldata0 */
#define ifbyte p0 /* if transfer has bytes in epilog/prolog */
#define ifhword p0 /* if transfer has shorts in epilog/prolog */
#define ifword p0 /* if transfer has words in epilog/prolog */
#define noprolog p0 /* no prolog, xfer starts at 32byte */
#define nokernel p1 /* no 32byte multiple block in the transfer */
#define noepilog p0 /* no epilog, xfer ends on 32byte boundary */
#define align p2 /* alignment of input rel to 8byte boundary */
#define kernel1 p0 /* kernel count == 1 */
#define dalign R25 /* rel alignment of input to output data */
#define star3 R16 /* number bytes in prolog - dwords */
#define rest R8 /* length - prolog bytes */
#define back R7 /* nr bytes > dword boundary in src block */
#define epilog R3 /* bytes in epilog */
#define inc R15:14 /* inc kernel by -1 and defetch ptr by 32 */
#define kernel R4 /* number of 32byte chunks in kernel */
#define ptr_in_p_128 R5 /* pointer for prefetch of input data */
#define mask R8 /* mask used to determine prolog size */
#define shift R8 /* used to work a shifter to extract bytes */
#define shift2 R5 /* in epilog to workshifter to extract bytes */
#define prolog R15 /* bytes in prolog */
#define epilogdws R15 /* number dwords in epilog */
#define shiftb R14 /* used to extract bytes */
#define offset R9 /* same as align in reg */
#define ptr_out_p_32 R17 /* pointer to output dczero */
#define align888 R14 /* if simple dword loop can be used */
#define len8 R9 /* number of dwords in length */
#define over R20 /* nr of bytes > last inp buf dword boundary */
/*
 * The information on this web page was compiled carefully and to the best
 * of our knowledge. However, neither the completeness, nor the correctness,
 * nor the quality of the information provided is guaranteed.
 * Remark:
 * The colored syntax highlighting and the measurement are still experimental.
 */