# This code is taken from the OpenSSL project but the author (Andy Polyakov)
# has relicensed it under the GPLv2. Therefore this program is free software;
# you can redistribute it and/or modify it under the terms of the GNU General
# Public License version 2 as published by the Free Software Foundation.
#
# The original headers, including the original license headers, are
# included below for completeness.
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see https://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# SHA512 block procedure for ARMv4. September 2007.
# This code is ~4.5 (four and a half) times faster than code generated
# by gcc 3.4 and it spends ~72 clock cycles per byte [on single-issue
# Xscale PXA250 core].
#
# July 2010.
#
# Rescheduling for dual-issue pipeline resulted in 6% improvement on
# Cortex A8 core and ~40 cycles per processed byte.
# February 2011.
#
# Profiler-assisted and platform-specific optimization resulted in 7%
# improvement on Cortex A8 core and ~38 cycles per byte.
# March 2011.
#
# Add NEON implementation. On Cortex A8 it was measured to process
# one byte in 23.3 cycles or ~60% faster than integer-only code.
# August 2012.
#
# Improve NEON performance by 12% on Snapdragon S4. In absolute
# terms it's 22.6 cycles per byte, which is a disappointing result.
# Technical writers asserted that the 3-way S4 pipeline can sustain
# multiple NEON instructions per cycle, but dual NEON issue could
# not be observed, see https://www.openssl.org/~appro/Snapdragon-S4.html
# for further details. As a side note, Cortex-A15 processes one byte in
# 16 cycles.
# Byte order [in]dependence. =========================================
#
# Originally the caller was expected to maintain a specific *dword* order
# in h[0-7], namely with the most significant dword at the *lower* address,
# which was reflected in the two parameters below as 0 and 4. Now the
# caller is expected to maintain native byte order for whole 64-bit values.
# The two "parameters" referred to by the byte-order note above. They are now
# the symbolic names "HI" and "LO" instead of the fixed values 0 and 4.
# NOTE(review): presumably these select the halves of a 64-bit word in the
# code template; the definitions of HI/LO are not visible in this part of the
# file -- confirm against the full source.
$hi="HI";
$lo="LO";
# ====================================================================

# Consume command-line arguments until one looks like an output file name
# ("name.ext"). $output is left false when no such argument is present.
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}

# Redirect STDOUT only when an output name was actually supplied; otherwise
# keep writing to the inherited STDOUT (e.g. "perl script.pl > out.S").
# Three-argument open keeps the name from being parsed as a mode, and a
# failure to create the requested file is reported instead of being ignored.
if (defined($output) && length($output)) {
	open(STDOUT, '>', $output) or die "can't open $output: $!";
}
.global	sha512_block_data_order
.type	sha512_block_data_order,%function
sha512_block_data_order:
.Lsha512_block_data_order:
#if __ARM_ARCH__<7
	sub	r3,pc,#8		@ sha512_block_data_order
#else
	adr	r3,.Lsha512_block_data_order
#endif
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
	ldr	r12,.LOPENSSL_armcap
	ldr	r12,[r3,r12]		@ OPENSSL_armcap_P
	tst	r12,#1
	bne	.LNEON
#endif
	add	$len,$inp,$len,lsl#7	@ len to point at the end of inp
	stmdb	sp!,{r4-r12,lr}
	sub	$Ktbl,r3,#672		@ K512
	sub	sp,sp,#9*8
.global	sha512_block_data_order_neon
.type	sha512_block_data_order_neon,%function
.align	4
sha512_block_data_order_neon:
.LNEON:
	dmb				@ errata #451034 on early Cortex A8
	add	$len,$inp,$len,lsl#7	@ len to point at the end of inp
	VFP_ABI_PUSH
	adr	$Ktbl,.Lsha512_block_data_order
	sub	$Ktbl,$Ktbl,.Lsha512_block_data_order-K512
	vldmia	$ctx,{$A-$H}		@ load context
.Loop_neon:
___
# The heredoc terminator above must stand alone on its line; otherwise the
# loop below would be swallowed into the assembly text instead of executed.
#
# Call NEON_00_15 for rounds 0..15. After every call @V is rotated by one
# position (its last element is moved to the front). $i is deliberately left
# at 16 for the loop that follows.
# NOTE(review): NEON_00_15 is defined outside this part of the file;
# presumably it appends one round of code to $code -- confirm.
for($i=0;$i<16;$i++)	{ &NEON_00_15($i,@V); unshift(@V,pop(@V)); }
# Rounds 16..79: the generator emits a 16-round body once, and the assembly
# loop at .L16_79_neon runs it four times ($cnt counts down from 4).
$code.=<<___;
	mov	$cnt,#4
.L16_79_neon:
	subs	$cnt,#1
___
# The loop has no initialiser: $i carries on from the value left by the
# preceding rounds loop and runs up to 31. @V is rotated by one position
# after each call, exactly as for the earlier rounds.
# NOTE(review): NEON_16_79 is defined outside this part of the file.
for(;$i<32;$i++)	{ &NEON_16_79($i,@V); unshift(@V,pop(@V)); }
# Loop back-edge, accumulation of the working variables into the stored
# context, outer loop over the input, and function return. The trailing
# "#endif" needs its own line to be seen by the C preprocessor.
$code.=<<___;
	bne	.L16_79_neon
	vadd.i64	$A,d30		@ h+=Maj from the past
	vldmia	$ctx,{d24-d31}		@ load context to temp
	vadd.i64	q8,q12		@ vectorized accumulate
	vadd.i64	q9,q13
	vadd.i64	q10,q14
	vadd.i64	q11,q15
	vstmia	$ctx,{$A-$H}		@ save context
	teq	$inp,$len
	sub	$Ktbl,#640		@ rewind K512
	bne	.Loop_neon
	VFP_ABI_POP
	ret				@ bx lr
.size	sha512_block_data_order_neon,.-sha512_block_data_order_neon
#endif
___
}
# Trailer of the generated file: an identification string, then -- only for
# non-kernel builds with __ARM_MAX_ARCH__>=7 -- the .comm declaration of
# OPENSSL_armcap_P. The preprocessor conditionals have to start their own
# lines; fused onto the .align/.comm lines they are not seen as directives.
# NOTE(review): the identification string ends in "by " and looks truncated;
# it is kept unchanged here -- confirm the intended text against upstream.
$code.=<<___;
.asciz	"SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by "
.align	2
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.comm	OPENSSL_armcap_P,4,4
#endif
___
# Post-process the accumulated assembly text in $code. The order of the
# three substitutions matters.

# 1. Replace every `...` span with the result of evaluating its contents as
#    a Perl expression.
$code =~ s/\`([^\`]*)\`/eval $1/gem;
# 2. Replace each literal "bx lr" with a raw .word (presumably the encoding of
#    that instruction), so the assembler accepts it for plain ARMv4.
$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4
# 3. Only afterwards expand the "ret" placeholder to "bx lr". Because step 2
#    has already run, these occurrences stay as a mnemonic.
$code =~ s/\bret\b/bx lr/gm;
# Copy this script's leading comment block to the output as assembler
# comments: a "#!" line is skipped, a leading "#" becomes "@", blank lines
# pass through, and copying stops at the first line that is neither.
#
# A lexical handle with three-argument open is used so that characters in $0
# are never interpreted as an open mode or a pipe. The header is best-effort:
# if the script cannot be re-read, the output simply lacks it.
if (open(my $self, '<', $0)) {
	while (<$self>) {
		next if (/^#!/);
		last if (!s/^#/@/ and !/^$/);
		print;
	}
	close($self);
}
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.