/* * Design notes: * * 16 registers would be needed to hold the state matrix, but only 14 are * available because 'sp' and 'pc' cannot be used. So we spill the elements * (x8, x9) to the stack and swap them out with (x10, x11). This adds one * 'ldrd' and one 'strd' instruction per round. * * All rotates are performed using the implicit rotate operand accepted by the * 'add' and 'eor' instructions. This is faster than using explicit rotate * instructions. To make this work, we allow the values in the second and last * rows of the ChaCha state matrix (rows 'b' and 'd') to temporarily have the * wrong rotation amount. The rotation amount is then fixed up just in time * when the values are used. 'brot' is the number of bits the values in row 'b' * need to be rotated right to arrive at the correct values, and 'drot' * similarly for row 'd'. (brot, drot) start out as (0, 0) but we make it such * that they end up as (25, 24) after every round.
*/
.Lnext_block\@:
// Stack: unused0-unused1 x10-x11 x0-x15 OUT IN LEN
// Registers contain x0-x9,x12-x15.
// Do the core ChaCha permutation to update x0-x15.
_chacha_permute \nrounds
add sp, #8
// Stack: x10-x11 orig_x0-orig_x15 OUT IN LEN
// Registers contain x0-x9,x12-x15.
// x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.
// Free up some registers (r8-r12,r14) by pushing (x8-x9,x12-x15).
push {X8_X10, X9_X11, X12, X13, X14, X15}
// Use slow path if fewer than 64 bytes remain.
cmp r11, #64
blt .Lxor_slowpath\@
// Use slow path if IN and/or OUT isn't 4-byte aligned. Needed even on
// ARMv6+, since ldmia and stmia (used below) still require alignment.
tst r10, #3
bne .Lxor_slowpath\@
// Fast path: XOR 64 bytes of aligned data.
// Stack: x8-x9 x12-x15 x10-x11 orig_x0-orig_x15 OUT IN LEN
// Registers: r0-r7 are x0-x7; r8-r11 are free; r12 is IN; r14 is OUT.
// x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.
// Store updated (OUT, IN, LEN)
str r14, [sp, #64]
str r12, [sp, #68]
str r9, [sp, #72]
mov r14, sp
// Store updated block counter (x12)
str r8, [sp, #48]
sub sp, #16
// Reload state and do next block
ldmia r14!, {r0-r11} // load x0-x11
__strd r10, r11, sp, 8 // store x10-x11 before state
ldmia r14, {r10-r12,r14} // load x12-x15
b .Lnext_block\@
.Lxor_slowpath\@:
// Slow path: < 64 bytes remaining, or unaligned input or output buffer.
// We handle it by storing the 64 bytes of keystream to the stack, then
// XOR-ing the needed portion with the data.
// Allocate keystream buffer sub sp, #64
mov r14, sp
// Stack: ks0-ks15 x8-x9 x12-x15 x10-x11 orig_x0-orig_x15 OUT IN LEN
// Registers: r0-r7 are x0-x7; r8-r11 are free; r12 is IN; r14 is &ks0.
// x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.
// Stack: ks0-ks15 unused0-unused7 x0-x15 OUT IN LEN
// Registers: r8 is block counter, r12 is IN.
ldr r9, [sp, #168] // LEN
ldr r14, [sp, #160] // OUT
cmp r9, #64
mov r0, sp
movle r1, r9
movgt r1, #64
// r1 is number of bytes to XOR, in range [1, 64]
.if __LINUX_ARM_ARCH__ < 6
orr r2, r12, r14
tst r2, #3 // IN or OUT misaligned?
bne .Lxor_next_byte\@
.endif
// XOR a word at a time
.rept 16
subs r1, #4
blt .Lxor_words_done\@
ldr r2, [r12], #4
ldr r3, [r0], #4
eor r2, r2, r3
str r2, [r14], #4
.endr
b .Lxor_slowpath_done\@
.Lxor_words_done\@:
ands r1, r1, #3
beq .Lxor_slowpath_done\@
// XOR a byte at a time
.Lxor_next_byte\@:
ldrb r2, [r12], #1
ldrb r3, [r0], #1
eor r2, r2, r3
strb r2, [r14], #1
subs r1, #1
bne .Lxor_next_byte\@
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.