Quelle chacha-scalar-core.S Sprache: ARM-Assembler (GNU as)

/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (C) 2018 Google, Inc.
*/

#include <linux/linkage.h>
#include <asm/assembler.h>

/*
* Design notes:
*
* 16 registers would be needed to hold the state matrix, but only 14 are
* available because 'sp' and 'pc' cannot be used.  So we spill the elements
* (x8, x9) to the stack and swap them out with (x10, x11).  This adds one
* 'ldrd' and one 'strd' instruction per round.
*
* All rotates are performed using the implicit rotate operand accepted by the
* 'add' and 'eor' instructions.  This is faster than using explicit rotate
* instructions.  To make this work, we allow the values in the second and last
* rows of the ChaCha state matrix (rows 'b' and 'd') to temporarily have the
* wrong rotation amount.  The rotation amount is then fixed up just in time
* when the values are used.  'brot' is the number of bits the values in row 'b'
* need to be rotated right to arrive at the correct values, and 'drot'
* similarly for row 'd'.  (brot, drot) start out as (0, 0) but we make it such
* that they end up as (25, 24) after every round.
*/

// ChaCha state registers
//
// Symbolic names for the ARM core registers that hold the 16 ChaCha state
// words x0-x15.  Only 14 registers are used: x8/x10 share r8 and x9/x11
// share r9, with the inactive pair kept on the stack.
// Note that X14 is r12 (ip) and X15 is r14 (lr), so the entry points below
// push lr before the state is loaded into these registers.
X0 .req r0
X1 .req r1
X2 .req r2
X3 .req r3
X4 .req r4
X5 .req r5
X6 .req r6
X7 .req r7
X8_X10 .req r8 // shared by x8 and x10
X9_X11 .req r9 // shared by x9 and x11
X12 .req r10
X13 .req r11
X14 .req r12
X15 .req r14

// _le32_bswap_4x: byte-swap four registers on big-endian builds.
//
// a, b, c, d - registers to convert in place, processed in that order
// tmp        - scratch register handed to rev_l for each conversion
//
// Expands to nothing unless __ARMEB__ is defined by the preprocessor.
.macro _le32_bswap_4x a, b, c, d,  tmp
#ifdef __ARMEB__
.irp word, \a, \b, \c, \d
rev_l  \word,  \tmp
.endr
#endif
.endm

// __ldrd: load two consecutive 32-bit words from memory.
//
// a      - receives the word at [src + offset]
// b      - receives the word at [src + offset + 4]
// src    - base address register (not written back)
// offset - constant byte offset
//
// A single ldrd is emitted when __LINUX_ARM_ARCH__ >= 6, otherwise two ldr
// instructions are used.
// NOTE(review): the ARM-state encoding of ldrd wants an even/odd consecutive
// register pair; every caller in this file passes such a pair - keep it that
// way when adding new uses.
.macro __ldrd  a, b, src, offset
#if __LINUX_ARM_ARCH__ >= 6
ldrd  \a, \b, [\src, #\offset]
#else
ldr  \a, [\src, #\offset]
ldr  \b, [\src, #\offset + 4]
#endif
.endm

// __strd: store two 32-bit words to consecutive memory locations.
//
// a      - stored to [dst + offset]
// b      - stored to [dst + offset + 4]
// dst    - base address register (not written back)
// offset - constant byte offset
//
// A single strd is emitted when __LINUX_ARM_ARCH__ >= 6, otherwise two str
// instructions are used.
// NOTE(review): the ARM-state encoding of strd wants an even/odd consecutive
// register pair; every caller in this file passes such a pair - keep it that
// way when adding new uses.
.macro __strd  a, b, dst, offset
#if __LINUX_ARM_ARCH__ >= 6
strd  \a, \b, [\dst, #\offset]
#else
str  \a, [\dst, #\offset]
str  \b, [\dst, #\offset + 4]
#endif
.endm

// _halfround: run two ChaCha quarter-rounds side by side.
//
// (a1, b1, c1, d1) and (a2, b2, c2, d2) name the registers of two
// independent quarter-rounds; the instructions of the two are interleaved.
// No rotate instruction is emitted: every rol() of the algorithm is deferred
// and folded into the 'ror' shifter operand of the next add/eor that reads
// the value.
//
// On entry the b registers must be rotated right by 'brot' bits and the d
// registers by 'drot' bits to obtain their true values (both are assembler
// symbols evaluated at expansion time).  On exit the b registers still need
// 'ror #25' and the d registers 'ror #24'.  The macro does not update the
// symbols itself; the caller is responsible for that.
.macro _halfround a1, b1, c1, d1,  a2, b2, c2, d2

// a += b; d ^= a; d = rol(d, 16);
// (the incoming pending rotations of b and d are applied here)
add  \a1, \a1, \b1, ror #brot
add  \a2, \a2, \b2, ror #brot
eor  \d1, \a1, \d1, ror #drot
eor  \d2, \a2, \d2, ror #drot
// drot == 32 - 16 == 16

// c += d; b ^= c; b = rol(b, 12);
// (b has not been rewritten yet, so it still carries the incoming 'brot')
add  \c1, \c1, \d1, ror #16
add  \c2, \c2, \d2, ror #16
eor  \b1, \c1, \b1, ror #brot
eor  \b2, \c2, \b2, ror #brot
// brot == 32 - 12 == 20

// a += b; d ^= a; d = rol(d, 8);
add  \a1, \a1, \b1, ror #20
add  \a2, \a2, \b2, ror #20
eor  \d1, \a1, \d1, ror #16
eor  \d2, \a2, \d2, ror #16
// drot == 32 - 8 == 24

// c += d; b ^= c; b = rol(b, 7);
add  \c1, \c1, \d1, ror #24
add  \c2, \c2, \d2, ror #24
eor  \b1, \c1, \b1, ror #20
eor  \b2, \c2, \b2, ror #20
// brot == 32 - 7 == 25
.endm

// _doubleround: one column round followed by one diagonal round.
//
// Each round is two _halfround expansions (four quarter-rounds).  Because
// x8/x10 and x9/x11 share registers, the c-row pair is swapped through the
// stack between the two halves of each round:
//   [sp, #0] holds (x8, x9)   while (x10, x11) are in registers
//   [sp, #8] holds (x10, x11) while (x8, x9)   are in registers
//
// Entry: r8/r9 hold (x8, x9), [sp, #8] holds (x10, x11); 'brot'/'drot'
//        describe the pending rotations of rows b and d.
// Exit:  r8/r9 hold (x8, x9) again, [sp, #8] holds the updated (x10, x11);
//        brot == 25 and drot == 24.
.macro _doubleround

// column round

// quarterrounds: (x0, x4, x8, x12) and (x1, x5, x9, x13)
_halfround X0, X4, X8_X10, X12,  X1, X5, X9_X11, X13

// save (x8, x9); restore (x10, x11)
__strd  X8_X10, X9_X11, sp, 0
__ldrd  X8_X10, X9_X11, sp, 8

// quarterrounds: (x2, x6, x10, x14) and (x3, x7, x11, x15)
_halfround X2, X6, X8_X10, X14,  X3, X7, X9_X11, X15

// Both column half-rounds above consumed the incoming rotation amounts.
// From here on every b/d register carries the amounts a half-round leaves
// behind, so record them for the diagonal round and for whatever follows.
.set brot, 25
.set drot, 24

// diagonal round

// quarterrounds: (x0, x5, x10, x15) and (x1, x6, x11, x12)
_halfround X0, X5, X8_X10, X15,  X1, X6, X9_X11, X12

// save (x10, x11); restore (x8, x9)
__strd  X8_X10, X9_X11, sp, 8
__ldrd  X8_X10, X9_X11, sp, 0

// quarterrounds: (x2, x7, x8, x13) and (x3, x4, x9, x14)
_halfround X2, X7, X8_X10, X13,  X3, X4, X9_X11, X14
.endm

// _chacha_permute: fully unrolled ChaCha permutation.
//
// nrounds - number of rounds; nrounds / 2 double rounds are emitted, so the
//           value is expected to be even (the callers in this file use 20
//           and 12).
//
// Entry: registers hold x0-x9 and x12-x15 with no pending rotation,
//        [sp, #8] holds (x10, x11) and [sp, #0] is an 8-byte scratch slot.
// Exit:  same register/stack assignment, but x4-x7 still need to be rotated
//        right by 'brot' (25) and x12-x15 by 'drot' (24) bits.
.macro _chacha_permute nrounds
// The state is loaded unrotated, so start with no pending rotations.
.set brot, 0
.set drot, 0
.rept \nrounds / 2
  _doubleround
.endr
.endm

// _chacha: encrypt/decrypt all remaining data with ChaCha<nrounds>.
//
// nrounds - number of ChaCha rounds, passed on to _chacha_permute.
//
// For each 64-byte block: permute the state, add the original state
// (kept on the stack) to form the keystream, XOR it with IN into OUT, then
// bump the block counter and repeat until LEN is used up.  Control falls out
// of the bottom of the macro (.Ldone) with sp pointing at the saved state,
// i.e. "Stack: x0-x15 OUT IN LEN".
//
// The labels carry the macro invocation counter as a suffix so that the
// macro can be expanded more than once per file.
//
// NOTE(review): LEN is tested with signed conditions (blt, movle/movgt,
// bgt) although the C prototype declares it as unsigned int; this presumably
// relies on callers never passing 2 GiB or more - confirm against the glue
// code.
.macro _chacha  nrounds

.Lnext_block\@:
// Stack: unused0-unused1 x10-x11 x0-x15 OUT IN LEN
// Registers contain x0-x9,x12-x15.

// Do the core ChaCha permutation to update x0-x15.
_chacha_permute \nrounds

// Drop the 8-byte (x8, x9) spill slot used by the permutation.
add  sp, #8
// Stack: x10-x11 orig_x0-orig_x15 OUT IN LEN
// Registers contain x0-x9,x12-x15.
// x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.

// Free up some registers (r8-r12,r14) by pushing (x8-x9,x12-x15).
push  {X8_X10, X9_X11, X12, X13, X14, X15}

// Load (OUT, IN, LEN).
// Offset 96 = 24 bytes just pushed + 8 bytes (x10-x11) + 64 bytes of state.
ldr  r14, [sp, #96]
ldr  r12, [sp, #100]
ldr  r11, [sp, #104]

// Merge both pointers so that one tst below checks the alignment of each.
orr  r10, r14, r12

// Use slow path if fewer than 64 bytes remain.
cmp  r11, #64
blt  .Lxor_slowpath\@

// Use slow path if IN and/or OUT isn't 4-byte aligned. Needed even on
// ARMv6+, since ldmia and stmia (used below) still require alignment.
tst  r10, #3
bne  .Lxor_slowpath\@

// Fast path: XOR 64 bytes of aligned data.

// Stack: x8-x9 x12-x15 x10-x11 orig_x0-orig_x15 OUT IN LEN
// Registers: r0-r7 are x0-x7; r8-r11 are free; r12 is IN; r14 is OUT.
// x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.

// x0-x3
// orig_x0-orig_x3 live at sp + 32 (above the 8 words described above).
__ldrd  r8, r9, sp, 32
__ldrd  r10, r11, sp, 40
add  X0, X0, r8
add  X1, X1, r9
add  X2, X2, r10
add  X3, X3, r11
_le32_bswap_4x X0, X1, X2, X3,  r8
ldmia  r12!, {r8-r11}
eor  X0, X0, r8
eor  X1, X1, r9
eor  X2, X2, r10
eor  X3, X3, r11
stmia  r14!, {X0-X3}

// x4-x7
// The pending 'brot' rotation is applied while adding the original state.
// r0-r3 are free now that x0-x3 have been written out, so they receive the
// next 16 bytes of input.
__ldrd  r8, r9, sp, 48
__ldrd  r10, r11, sp, 56
add  X4, r8, X4, ror #brot
add  X5, r9, X5, ror #brot
ldmia  r12!, {X0-X3}
add  X6, r10, X6, ror #brot
add  X7, r11, X7, ror #brot
_le32_bswap_4x X4, X5, X6, X7,  r8
eor  X4, X4, X0
eor  X5, X5, X1
eor  X6, X6, X2
eor  X7, X7, X3
stmia  r14!, {X4-X7}

// x8-x15
// The pop leaves: r0-r1 = x8-x9, r2-r5 = x12-x15, r6-r7 = x10-x11, and sp
// pointing at orig_x0, so orig_x8 is at sp + 32 and orig_x12 at sp + 48.
pop  {r0-r7}   // (x8-x9,x12-x15,x10-x11)
__ldrd  r8, r9, sp, 32
__ldrd  r10, r11, sp, 40
add  r0, r0, r8  // x8
add  r1, r1, r9  // x9
add  r6, r6, r10  // x10
add  r7, r7, r11  // x11
_le32_bswap_4x r0, r1, r6, r7,  r8
ldmia  r12!, {r8-r11}
eor  r0, r0, r8  // x8
eor  r1, r1, r9  // x9
eor  r6, r6, r10  // x10
eor  r7, r7, r11  // x11
stmia  r14!, {r0,r1,r6,r7}
ldmia  r12!, {r0,r1,r6,r7}
__ldrd  r8, r9, sp, 48
__ldrd  r10, r11, sp, 56
add  r2, r8, r2, ror #drot // x12
add  r3, r9, r3, ror #drot // x13
add  r4, r10, r4, ror #drot // x14
add  r5, r11, r5, ror #drot // x15
// r9 (not r8) is the scratch register here: r8 still holds orig_x12, the
// block counter, which is incremented in .Lprepare_for_next_block.
_le32_bswap_4x r2, r3, r4, r5,  r9
   ldr  r9, [sp, #72]  // load LEN
eor  r2, r2, r0  // x12
eor  r3, r3, r1  // x13
eor  r4, r4, r6  // x14
eor  r5, r5, r7  // x15
   subs  r9, #64   // decrement and check LEN
stmia  r14!, {r2-r5}

// Flags are still those of the subs above: Z set means nothing is left.
beq  .Ldone\@

.Lprepare_for_next_block\@:
// Reached from both the fast and the slow path with:
// r8 = block counter of the block just done, r9 = remaining LEN,
// r12 = updated IN, r14 = updated OUT.

// Stack: x0-x15 OUT IN LEN

// Increment block counter (x12)
add  r8, #1

// Store updated (OUT, IN, LEN)
str  r14, [sp, #64]
str  r12, [sp, #68]
str  r9, [sp, #72]

// r14 walks over the saved state while sp is moved down to make room for
// (unused0-unused1, x10-x11).
   mov  r14, sp

// Store updated block counter (x12)
str  r8, [sp, #48]

   sub  sp, #16

// Reload state and do next block
ldmia  r14!, {r0-r11} // load x0-x11
__strd  r10, r11, sp, 8  // store x10-x11 before state
ldmia  r14, {r10-r12,r14} // load x12-x15
b  .Lnext_block\@

.Lxor_slowpath\@:
// Slow path: < 64 bytes remaining, or unaligned input or output buffer.
// We handle it by storing the 64 bytes of keystream to the stack, then
// XOR-ing the needed portion with the data.

// Allocate keystream buffer
sub  sp, #64
mov  r14, sp

// Stack: ks0-ks15 x8-x9 x12-x15 x10-x11 orig_x0-orig_x15 OUT IN LEN
// Registers: r0-r7 are x0-x7; r8-r11 are free; r12 is IN; r14 is &ks0.
// x4-x7 are rotated by 'brot'; x12-x15 are rotated by 'drot'.

// Save keystream for x0-x3
// All offsets below are the fast-path ones plus the 64-byte buffer.
__ldrd  r8, r9, sp, 96
__ldrd  r10, r11, sp, 104
add  X0, X0, r8
add  X1, X1, r9
add  X2, X2, r10
add  X3, X3, r11
_le32_bswap_4x X0, X1, X2, X3,  r8
stmia  r14!, {X0-X3}

// Save keystream for x4-x7
__ldrd  r8, r9, sp, 112
__ldrd  r10, r11, sp, 120
add  X4, r8, X4, ror #brot
add  X5, r9, X5, ror #brot
add  X6, r10, X6, ror #brot
add  X7, r11, X7, ror #brot
_le32_bswap_4x X4, X5, X6, X7,  r8
// r8 = address of the pushed (x8-x9,x12-x15,x10-x11) just above the buffer.
   add  r8, sp, #64
stmia  r14!, {X4-X7}

// Save keystream for x8-x15
// Unlike the fast path this reads the saved words without popping them;
// the stack is released in one go at .Lxor_slowpath_done.
ldm  r8, {r0-r7}  // (x8-x9,x12-x15,x10-x11)
__ldrd  r8, r9, sp, 128
__ldrd  r10, r11, sp, 136
add  r0, r0, r8  // x8
add  r1, r1, r9  // x9
add  r6, r6, r10  // x10
add  r7, r7, r11  // x11
_le32_bswap_4x r0, r1, r6, r7,  r8
stmia  r14!, {r0,r1,r6,r7}
__ldrd  r8, r9, sp, 144
__ldrd  r10, r11, sp, 152
add  r2, r8, r2, ror #drot // x12
add  r3, r9, r3, ror #drot // x13
add  r4, r10, r4, ror #drot // x14
add  r5, r11, r5, ror #drot // x15
// r9 is the scratch register so that r8 keeps orig_x12 (the block counter).
_le32_bswap_4x r2, r3, r4, r5,  r9
stmia  r14, {r2-r5}

// Stack: ks0-ks15 unused0-unused7 x0-x15 OUT IN LEN
// Registers: r8 is block counter, r12 is IN.

ldr  r9, [sp, #168]  // LEN
ldr  r14, [sp, #160]  // OUT
cmp  r9, #64
// r0 = read pointer into the keystream buffer; mov leaves the flags alone.
   mov  r0, sp
movle  r1, r9
movgt  r1, #64
// r1 is number of bytes to XOR, in range [1, 64]

// Before ARMv6 a misaligned buffer is handled entirely by the byte loop.
// On ARMv6+ the word loop below is used even for misaligned IN/OUT
// (presumably relying on unaligned ldr/str support - TODO confirm).
.if __LINUX_ARM_ARCH__ < 6
orr  r2, r12, r14
tst  r2, #3   // IN or OUT misaligned?
bne  .Lxor_next_byte\@
.endif

// XOR a word at a time
// Unrolled 16 times, i.e. enough for a full 64-byte block; leaves early as
// soon as fewer than 4 bytes remain.
.rept 16
subs  r1, #4
blt  .Lxor_words_done\@
ldr  r2, [r12], #4
ldr  r3, [r0], #4
eor  r2, r2, r3
str  r2, [r14], #4
.endr
b  .Lxor_slowpath_done\@
.Lxor_words_done\@:
// r1 went negative above; its low two bits are the number of tail bytes.
ands  r1, r1, #3
beq  .Lxor_slowpath_done\@

// XOR a byte at a time
.Lxor_next_byte\@:
ldrb  r2, [r12], #1
ldrb  r3, [r0], #1
eor  r2, r2, r3
strb  r2, [r14], #1
subs  r1, #1
bne  .Lxor_next_byte\@

.Lxor_slowpath_done\@:
// Release the keystream buffer (64) plus the 8 saved words (32).  The add
// does not touch the flags, so bgt still tests the result of the subs.
subs  r9, #64
add  sp, #96
bgt  .Lprepare_for_next_block\@

.Ldone\@:
.endm // _chacha

/*
* void chacha_doarm(u8 *dst, const u8 *src, unsigned int bytes,
*      const struct chacha_state *state, int nrounds);
*
* XOR 'bytes' bytes from 'src' with the ChaCha keystream and write the result
* to 'dst'.
*
* Register/stack arguments as read by the code below:
*   r0 = dst, r1 = src, r2 = bytes, r3 = state, [sp] = nrounds
*
* Returns immediately when bytes == 0.  nrounds == 12 selects the 12-round
* code; every other value runs the 20-round code.  The 16 state words are
* copied to the stack and the block counter is advanced in that copy only;
* nothing is written back through 'state'.
*/
ENTRY(chacha_doarm)
cmp  r2, #0   // len == 0?
reteq  lr

// Fetch nrounds and compare it now; none of the instructions up to the
// 'beq 1f' below modify the flags.
ldr  ip, [sp]
cmp  ip, #12

// Saves the callee-saved registers and lr, and places (dst, src, bytes) on
// the stack as the (OUT, IN, LEN) words expected by _chacha.
push  {r0-r2,r4-r11,lr}

// Push state x0-x15 onto stack.
// Also store an extra copy of x10-x11 just before the state.

add  X12, r3, #48
ldm  X12, {X12,X13,X14,X15}
push  {X12,X13,X14,X15}
// Room for unused0-unused1 (8), the x10-x11 copy (8) and x0-x11 (48).
sub  sp, sp, #64

__ldrd  X8_X10, X9_X11, r3, 40
__strd  X8_X10, X9_X11, sp, 8
__strd  X8_X10, X9_X11, sp, 56
// Loads r0-r9 with x0-x9; r3 (the state pointer) is overwritten here.
ldm  r3, {X0-X9_X11}
__strd  X0, X1, sp, 16
__strd  X2, X3, sp, 24
__strd  X4, X5, sp, 32
__strd  X6, X7, sp, 40
__strd  X8_X10, X9_X11, sp, 48

beq  1f
_chacha  20

// Skip the 64-byte state copy and the 12 bytes of (OUT, IN, LEN).
0: add  sp, #76
pop  {r4-r11, pc}

1: _chacha  12
b  0b
ENDPROC(chacha_doarm)

/*
* void hchacha_block_arm(const struct chacha_state *state,
*   u32 out[HCHACHA_OUT_WORDS], int nrounds);
*
* Run the ChaCha permutation on 'state' and store the permuted words x0-x3
* and x12-x15 (eight 32-bit words) to 'out'.  The original state is not
* added back in.
*
* Register arguments as read by the code below:
*   r0 = state, r1 = out, r2 = nrounds
*
* nrounds == 12 selects the 12-round permutation; every other value runs the
* 20-round one.
*/
ENTRY(hchacha_block_arm)
// r1 ('out') is pushed along with the callee-saved registers so that it can
// be popped again once the permutation no longer needs every register.
push  {r1,r4-r11,lr}

// The flags set here survive until the 'beq 1f' below.
cmp  r2, #12   // ChaCha12 ?

mov  r14, r0
ldmia  r14!, {r0-r11} // load x0-x11
push  {r10-r11}  // store x10-x11 to stack
ldm  r14, {r10-r12,r14} // load x12-x15
// Reserve the (x8, x9) spill slot that _doubleround uses at [sp, #0].
sub  sp, #8

beq  1f
_chacha_permute 20

// Skip over (unused0-unused1, x10-x11)
0: add  sp, #16

// Fix up rotations of x12-x15
// ('drot' is 24 after either permutation, so this tail is shared by the
// 12-round path; x0-x3 never carry a pending rotation.)
ror  X12, X12, #drot
ror  X13, X13, #drot
   pop  {r4}   // load 'out'
ror  X14, X14, #drot
ror  X15, X15, #drot

// Store (x0-x3,x12-x15) to 'out'
stm  r4, {X0,X1,X2,X3,X12,X13,X14,X15}

pop  {r4-r11,pc}

1: _chacha_permute 12
b  0b
ENDPROC(hchacha_block_arm)

Messung V0.5

¤ Dauer der Verarbeitung: 0.10 Sekunden (vorverarbeitet) ¤

Wurzel

Suchen

Beweissystem der NASA

Beweissystem Isabelle

NIST Cobol Testsuite

Cephes Mathematical Library

Wiener Entwicklungsmethode

Haftungshinweis

Die Informationen auf dieser Webseite wurden nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit, noch Qualität der bereit gestellten Informationen zugesichert.

Bemerkung:

Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.