/*
 * LD_VSR_CROSS16B loads the 2nd 16 bytes for _vaddr, which is not aligned
 * to a 16-byte boundary, and permutes the result with the 1st 16 bytes.
 *
 *    | y y y y y y y y y y y y y 0 1 2 | 3 4 5 6 7 8 9 a b c d e f z z z |
 *    ^                                  ^                 ^
 * 0xbbbb10                           0xbbbb20          0xbbbb30
 *                              ^
 *                           _vaddr
 *
 * _vmask is the mask generated by LVS
 * _v1st_qw is the 1st aligned QW of the current addr, which is already loaded.
 *   for example: 0xyyyyyyyyyyyyy012 for big endian
 * _v2nd_qw is the 2nd aligned QW of the current _vaddr, to be loaded.
 *   for example: 0x3456789abcdefzzz for big endian
 * The permute result is saved in _v_res.
 *   for example: 0x0123456789abcdef for big endian.
 */
#define LD_VSR_CROSS16B(_vaddr,_vmask,_v1st_qw,_v2nd_qw,_v_res) \
lvx _v2nd_qw,_vaddr,off16; \
VPERM(_v_res,_v1st_qw,_v2nd_qw,_vmask)
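/*
 * A minimal C sketch of what this macro computes (illustrative only; the
 * helper name and byte-level model are assumptions, not kernel code). It
 * mirrors lvx (load the next 16-byte-aligned quadword) plus vperm with
 * the LVS-generated mask (select the 16 consecutive bytes that start at
 * the unaligned address):
 *
 * #include <stdint.h>
 * #include <string.h>
 *
 * static void ld_vsr_cross16b(const uint8_t *vaddr, uint8_t v_res[16])
 * {
 *     const uint8_t *base = (const uint8_t *)((uintptr_t)vaddr & ~15UL);
 *     unsigned shift = (uintptr_t)vaddr & 15; // encoded in the LVS mask
 *     uint8_t qw[32];
 *     memcpy(qw, base, 16);            // _v1st_qw, already loaded
 *     memcpy(qw + 16, base + 16, 16);  // _v2nd_qw, the lvx above
 *     memcpy(v_res, qw + shift, 16);   // the VPERM byte select
 * }
 */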
/*
 * There are 2 categories for memcmp:
 * 1) src/dst have the same offset from the 8-byte boundary. The handlers
 *    are named like .Lsameoffset_xxxx
 * 2) src/dst have different offsets from the 8-byte boundary. The handlers
 *    are named like .Ldiffoffset_xxxx
 */
_GLOBAL_TOC(memcmp)
cmpdi cr1,r5,0
/* Use the short loop if the src/dst addresses do not
 * have the same offset from an 8-byte alignment boundary.
 */
xor r6,r3,r4
andi. r6,r6,7
/* Fall back to the short loop when comparing fewer than
 * 8 bytes at aligned addresses.
 */
cmpdi cr6,r5,7
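/*
 * A sketch of the two dispatch tests above in C (s1, s2 and len stand in
 * for r3, r4 and r5; the label names are illustrative, and the branches
 * consuming these compares live in the surrounding code):
 *
 * if (((uintptr_t)s1 ^ (uintptr_t)s2) & 7)  // xor + andi.
 *     goto diffoffset_path;
 * if (len <= 7)                             // cmpdi cr6,r5,7
 *     goto short_loop;
 */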
.Lsameoffset_8bytes_make_align_start:
/* Compare the leading bytes that are not 8-byte aligned so that
 * the rest of the comparison can run on 8-byte-aligned data.
 */
andi. r6,r3,7
/* Try to compare the first double word, which is not 8-byte aligned:
 * load the first double word at (src & ~7UL) and shift left the
 * appropriate number of bits before the comparison.
 */
rlwinm r6,r3,3,26,28
beq .Lsameoffset_8bytes_aligned
clrrdi r3,r3,3
clrrdi r4,r4,3
LD rA,0,r3
LD rB,0,r4
sld rA,rA,r6
sld rB,rB,r6
cmpld cr0,rA,rB
srwi r6,r6,3
bne cr0,.LcmpAB_lightweight
subfic r6,r6,8
subf. r5,r6,r5
addi r3,r3,8
addi r4,r4,8
beq .Lzero
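/*
 * A C sketch of the unaligned-head compare above (illustrative;
 * load_be64() is a hypothetical helper standing in for LD, which yields
 * the doubleword in big-endian byte significance so that an unsigned
 * compare matches memcmp() ordering; s1/s2 share the same offset here):
 *
 * size_t off  = (uintptr_t)s1 & 7;              // andi. r6,r3,7
 * unsigned sh = off * 8;                        // rlwinm: bytes -> bits
 * uint64_t a  = load_be64((uintptr_t)s1 & ~7UL) << sh;  // LD + sld
 * uint64_t b  = load_be64((uintptr_t)s2 & ~7UL) << sh;  // LD + sld
 * if (a != b)                                   // cmpld cr0,rA,rB
 *     return (a > b) ? 1 : -1;                  // .LcmpAB_lightweight
 * len -= 8 - off;                               // subfic + subf.
 * s1 = (const char *)(((uintptr_t)s1 & ~7UL) + 8);  // clrrdi + addi
 * s2 = (const char *)(((uintptr_t)s2 & ~7UL) + 8);
 * if (len == 0)
 *     return 0;                                 // beq .Lzero
 */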
.Lsameoffset_8bytes_aligned:
/* Now we are aligned to 8 bytes.
 * Use the .Llong loop if 32 or more bytes remain to compare.
 */
cmpdi cr6,r5,31
bgt cr6,.Llong
.Lcmp_lt32bytes:
/* compare 1 ~ 31 bytes; at least the r3 addr is 8-byte aligned now */
cmpdi cr5,r5,7
srdi r0,r5,3
ble cr5,.Lcmp_rest_lt8bytes
.Lcmp_rest_lt8bytes:
/*
 * Here we have less than 8 bytes to compare. At least s1 is aligned to
 * 8 bytes, but s2 may not be. We must make sure s2 + 7 doesn't cross a
 * page boundary, otherwise we might read past the end of the buffer and
 * trigger a page fault. We use 4K as the conservative minimum page
 * size. If we detect that case we go to the byte-by-byte loop.
 *
 * Otherwise the next double word is loaded from s1 and s2, and shifted
 * right to compare the appropriate bits.
 */
clrldi r6,r4,(64-12) // r6 = r4 & 0xfff
cmpdi r6,0xff8
bgt .Lshort
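/*
 * In C, the guard above is roughly (a sketch; 0xfff/0xff8 come from the
 * conservative 4K minimum page size named in the comment):
 *
 * // If any of the 8 bytes starting at s2 would fall into the next 4K
 * // page, a full doubleword load could fault past the buffer end.
 * if (((uintptr_t)s2 & 0xfff) > 0xff8)
 *     goto byte_by_byte;   // .Lshort
 */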
.Llong:
#ifdef CONFIG_ALTIVEC
BEGIN_FTR_SECTION
/* Try to use the vmx loop if the length is equal to or greater than 4K */
cmpldi cr6,r5,VMX_THRESH
bge cr6,.Lsameoffset_vmx_cmp
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
.Llong_novmx_cmp:
#endif
/* At least the s1 addr is aligned to 8 bytes */
li off8,8
li off16,16
li off24,24
.LcmpAB_lightweight:
/* skip NV GPRS restore */
li r3,1
bgtlr
li r3,-1
blr
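/*
 * .LcmpAB_lightweight turns the last unsigned compare (cr0 set by cmpld
 * rA,rB) into the memcmp() return value without restoring non-volatile
 * GPRs. In C terms, with a and b the two doublewords already known to
 * differ (a sketch):
 *
 * return (a > b) ? 1 : -1;
 */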
#ifdef CONFIG_ALTIVEC
.Lsameoffset_vmx_cmp:
/* Entered with src/dst addrs that have the same offset from an 8-byte
 * alignment boundary.
 *
 * There is an optimization based on the following fact: memcmp()
 * tends to fail early, within the first 32 bytes.
 * Before applying VMX instructions, which incur a 32x128-bit
 * VMX regs load/restore penalty, we compare the first 32 bytes
 * so that we can catch the ~80% of cases that fail there.
 */
3:
/* Need to check whether r4 has the same offset as r3
 * relative to a 16-byte boundary.
 */
xor r0,r3,r4
andi. r0,r0,0xf
bne .Ldiffoffset_vmx_cmp_start
/* len is no less than 4KB. Need to align further, to 16 bytes.
 */
andi. rA,r3,8
LD rA,0,r3
beq 4f
LD rB,0,r4
cmpld cr0,rA,rB
addi r3,r3,8
addi r4,r4,8
addi r5,r5,-8
beq cr0,4f

/* save and restore cr0 */
mfocrf r5,128
EXIT_VMX_OPS
mtocrf 128,r5
b .LcmpAB_lightweight
4:
/* compare 32 bytes for each loop */
srdi r0,r5,5
mtctr r0
clrldi r5,r5,59
li off16,16
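/*
 * The setup above splits len into the 32-byte loop count and the tail;
 * in C (a sketch, with len standing for r5):
 *
 * size_t loops = len >> 5;  // srdi r0,r5,5 + mtctr: loop iterations
 * size_t tail  = len & 31;  // clrldi r5,r5,59: bytes left after the loop
 */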
.Ldiffoffset_align_s1_8bytes:
/* now s1 is aligned to 8 bytes */
#ifdef CONFIG_ALTIVEC
BEGIN_FTR_SECTION
/* only do vmx ops when the size is equal to or greater than 4K bytes */
cmpdi cr5,r5,VMX_THRESH
bge cr5,.Ldiffoffset_vmx_cmp
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
.Ldiffoffset_novmx_cmp:
#endif
cmpdi cr5,r5,31
ble cr5,.Lcmp_lt32bytes
#ifdef CONFIG_ALTIVEC
b .Llong_novmx_cmp
#else
b .Llong
#endif
.Ldiffoffset_vmx_s1_16bytes_align:
/* now s1 is aligned to 16 bytes */
lvx v6,0,r4
LVS v4,0,r4
srdi r6,r5,5 /* loop for 32 bytes each */
clrldi r5,r5,59
mtctr r6
.balign 16
.Ldiffoffset_vmx_32bytesloop:
/* the first qw of r4 was saved in v6 */
lvx v9,0,r3
LD_VSR_CROSS16B(r4,v4,v6,v8,v10)
VCMPEQUB_RC(v7,v9,v10)
vor v6,v8,v8
bnl cr6,.Ldiffoffset_vmx_diff_found
addi r3,r3,16
addi r4,r4,16
lvx v9,0,r3
LD_VSR_CROSS16B(r4,v4,v6,v8,v10)
VCMPEQUB_RC(v7,v9,v10)
vor v6,v8,v8
bnl cr6,.Ldiffoffset_vmx_diff_found
addi r3,r3,16
addi r4,r4,16
bdnz .Ldiffoffset_vmx_32bytesloop
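/*
 * One 16-byte step of the loop above in C terms (a sketch reusing the
 * hypothetical ld_vsr_cross16b() from the macro comment; v6's job is to
 * carry the already-loaded aligned quadword of s2 forward so each step
 * issues only one new aligned load):
 *
 * uint8_t chunk[16];
 * ld_vsr_cross16b(s2, chunk);        // lvx + VPERM, v6 carried forward
 * if (memcmp(s1, chunk, 16) != 0)    // vcmpequb. ; bnl cr6 on any diff
 *     goto diff_found;
 * s1 += 16;                          // addi r3,r3,16
 * s2 += 16;                          // addi r4,r4,16
 */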
EXIT_VMX_OPS
cmpdi r5,0
beq .Lzero
b .Lcmp_lt32bytes
.Ldiffoffset_vmx_diff_found:
EXIT_VMX_OPS
/* in any case, the difference lies within the next 16 bytes */
li r5,16
b .Lcmp_lt32bytes
#endif
EXPORT_SYMBOL(memcmp)