/*
 * Computes the checksum of a memory block at buff, length len,
 * and adds in "sum" (32-bit).
 *
 * __csum_partial(r3=buff, r4=len, r5=sum)
 */
_GLOBAL(__csum_partial)
addic r0,r5,0 /* clear carry */
srdi. r6,r4,3 /* less than 8 bytes? */
beq .Lcsum_tail_word
/* * If only halfword aligned, align to a double word. Since odd * aligned addresses should be rare and they would require more * work to calculate the correct checksum, we ignore that case * and take the potential slowdown of unaligned loads.
*/
rldicl. r6,r3,64-1,64-2 /* r6 = (r3 >> 1) & 0x3 */
beq .Lcsum_aligned
.Lcsum_aligned: /* * We unroll the loop such that each iteration is 64 bytes with an * entry and exit limb of 64 bytes, meaning a minimum size of * 128 bytes.
*/
srdi. r6,r4,7
beq .Lcsum_tail_doublewords /* len < 128 */
/* * On POWER6 and POWER7 back to back adde instructions take 2 cycles * because of the XER dependency. This means the fastest this loop can * go is 16 cycles per iteration. The scheduling of the loop below has * been shown to hit this on both POWER6 and POWER7.
*/
.align 5
2:
adde r0,r0,r6 ld r12,32(r3) ld r14,40(r3)
.Lcsum_tail_byte: /* Up to 1 byte to go */
andi. r6,r4,1
beq .Lcsum_finish
lbz r6,0(r3)
#ifdef __BIG_ENDIAN__
sldi r9,r6,8 /* Pad the byte out to 16 bits */
adde r0,r0,r9
#else
adde r0,r0,r6
#endif
.Lcsum_finish:
addze r0,r0 /* add in final carry */
rldicl r4,r0,32,0 /* fold two 32 bit halves together */
add r3,r4,r0
srdi r3,r3,32
blr
EXPORT_SYMBOL(__csum_partial)
.macro srcnr
100:
EX_TABLE(100b,.Lerror_nr)
.endm
.macro source
150:
EX_TABLE(150b,.Lerror)
.endm
.macro dstnr
200:
EX_TABLE(200b,.Lerror_nr)
.endm
.macro dest
250:
EX_TABLE(250b,.Lerror)
.endm
/*
 * Computes the checksum of a memory block at src, length len,
 * and adds in 0xffffffff (32-bit), while copying the block to dst.
 * If an access exception occurs, it returns 0.
 *
 * csum_partial_copy_generic(r3=src, r4=dst, r5=len)
 */
_GLOBAL(csum_partial_copy_generic)
li r6,-1
addic r0,r6,0 /* clear carry */
srdi. r6,r5,3 /* less than 8 bytes? */
beq .Lcopy_tail_word
/* * If only halfword aligned, align to a double word. Since odd * aligned addresses should be rare and they would require more * work to calculate the correct checksum, we ignore that case * and take the potential slowdown of unaligned loads. * * If the source and destination are relatively unaligned we only * align the source. This keeps things simple.
*/
rldicl. r6,r3,64-1,64-2 /* r6 = (r3 >> 1) & 0x3 */
beq .Lcopy_aligned
.Lcopy_aligned: /* * We unroll the loop such that each iteration is 64 bytes with an * entry and exit limb of 64 bytes, meaning a minimum size of * 128 bytes.
*/
srdi. r6,r5,7
beq .Lcopy_tail_doublewords /* len < 128 */
/* * On POWER6 and POWER7 back to back adde instructions take 2 cycles * because of the XER dependency. This means the fastest this loop can * go is 16 cycles per iteration. The scheduling of the loop below has * been shown to hit this on both POWER6 and POWER7.
*/
.align 5
2:
adde r0,r0,r6
source; ld r12,32(r3)
source; ld r14,40(r3)
/*
 * NOTE(review): the following text is not assembly — it appears to be
 * web-extraction residue (a German website disclaimer) accidentally
 * appended to the file. Translated to English and commented out so it
 * cannot be fed to the assembler; it should likely be removed entirely:
 *
 * "The information on this website has been carefully compiled to the
 * best of our knowledge. However, no guarantee is given as to the
 * completeness, correctness, or quality of the information provided.
 * Note: the colored syntax highlighting and the measurement are still
 * experimental."
 */