Quelle sm4-ce-core.S Sprache: ARM64-Assembler (AArch64, GNU as)

/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
* SM4 Cipher Algorithm for ARMv8 with Crypto Extensions
* as specified in
* https://tools.ietf.org/id/draft-ribose-cfrg-sm4-10.html
*
* Copyright (C) 2022, Alibaba Group.
* Copyright (C) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
*/

#include <linux/linkage.h>
#include <asm/assembler.h>
#include "sm4-ce-asm.h"

.arch armv8-a+crypto

/*
 * For every vector register that may be handed to the sm4e/sm4ekey macros
 * below, define an assembler symbol .Lv<N>.4s whose value is N. The macros
 * use these symbols to place register numbers into hand-built .inst words.
 * Only the registers in this list (v0-v15, v20, v24-v31) can be used as
 * operands of those macros.
 */
.irp	b, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 20, 24, 25, 26, 27, 28, 29, 30, 31
	.set	.Lv\b\().4s, \b
.endr

/*
 * sm4e vd.4s, vn.4s
 *
 * Emits the instruction as a raw 32-bit word instead of a mnemonic
 * (presumably so that assemblers without SM4 support can build this file --
 * confirm). The base word is 0xcec08400; the number of vn is placed at bit 5
 * and the number of vd at bit 0. Operands must be written as vN.4s with N
 * taken from the .irp list above, otherwise the .Lv<N>.4s symbol is undefined.
 */
.macro	sm4e, vd, vn
	.inst	0xcec08400 | (.L\vn << 5) | .L\vd
.endm

/*
 * sm4ekey vd.4s, vn.4s, vm.4s
 *
 * Emits the instruction as a raw 32-bit word. The base word is 0xce60c800;
 * the number of vm is placed at bit 16, vn at bit 5 and vd at bit 0.
 * Operands must be written as vN.4s with N taken from the .irp list above.
 */
.macro	sm4ekey, vd, vn, vm
	.inst	0xce60c800 | (.L\vm << 16) | (.L\vn << 5) | .L\vd
.endm

/* Register macros */

/* Scratch vectors (used by tweak_next and the XTS mask setup in this file) */
#define RTMP0	v16
#define RTMP1	v17
#define RTMP2	v18
#define RTMP3	v19

/*
 * RIV (CBC chaining value) and RMAC (running MAC digest) deliberately share
 * v20; no function in this file uses both. v20 is in the .irp list above,
 * so it may be passed to the sm4e macro.
 */
#define RIV	v20
#define RMAC	v20
/* XTS tweak reduction constant, built at run time in the XTS functions */
#define RMASK	v21

/*
 * sm4_ce_expand_key - derive the encryption and decryption round keys.
 *
 * input:
 *   x0: 128-bit key
 *   x1: rkey_enc (8 vectors = 128 bytes are written)
 *   x2: rkey_dec (8 vectors = 128 bytes are written)
 *   x3: fk array (16 bytes are read)
 *   x4: ck array (128 bytes are read)
 *
 * x1, x2 and x4 are advanced by post-indexed accesses; x5 and
 * v0-v7, v16-v31 are overwritten.
 */
.align 3
SYM_FUNC_START(sm4_ce_expand_key)
	ld1	{v0.16b}, [x0]
	/* byte-swap every 32-bit word of the key */
	rev32	v0.16b, v0.16b
	ld1	{v1.16b}, [x3]
	/* ck goes to v24-v31 */
	ld1	{v24.16b-v27.16b}, [x4], #64
	ld1	{v28.16b-v31.16b}, [x4]

	/* key ^ fk is the starting state of the key schedule */
	eor	v0.16b, v0.16b, v1.16b

	/* each step consumes the previous result plus one ck vector */
	sm4ekey	v0.4s, v0.4s, v24.4s
	sm4ekey	v1.4s, v0.4s, v25.4s
	sm4ekey	v2.4s, v1.4s, v26.4s
	sm4ekey	v3.4s, v2.4s, v27.4s
	sm4ekey	v4.4s, v3.4s, v28.4s
	sm4ekey	v5.4s, v4.4s, v29.4s
	sm4ekey	v6.4s, v5.4s, v30.4s
	sm4ekey	v7.4s, v6.4s, v31.4s

	/* ck in v24 is no longer needed: reuse v24 for the word-reversal mask */
	adr_l	x5, .Lbswap128_mask
	ld1	{v24.16b}, [x5]

	/* encryption round keys, in generation order */
	st1	{v0.16b-v3.16b}, [x1], #64
	st1	{v4.16b-v7.16b}, [x1]

	/*
	 * Decryption round keys are the same words in reverse order: the
	 * vectors are taken last-to-first and the mask reverses the four
	 * 32-bit words inside each vector.
	 */
	tbl	v16.16b, {v7.16b}, v24.16b
	tbl	v17.16b, {v6.16b}, v24.16b
	tbl	v18.16b, {v5.16b}, v24.16b
	tbl	v19.16b, {v4.16b}, v24.16b
	tbl	v20.16b, {v3.16b}, v24.16b
	tbl	v21.16b, {v2.16b}, v24.16b
	tbl	v22.16b, {v1.16b}, v24.16b
	tbl	v23.16b, {v0.16b}, v24.16b

	st1	{v16.16b-v19.16b}, [x2], #64
	st1	{v20.16b-v23.16b}, [x2]

	ret
SYM_FUNC_END(sm4_ce_expand_key)

/*
 * sm4_ce_crypt_block - run one 16-byte block through the cipher.
 *
 * input:
 *   x0: round key array, CTX
 *   x1: dst
 *   x2: src
 *
 * SM4_PREPARE and SM4_CRYPT_BLK come from sm4-ce-asm.h (not visible here);
 * presumably SM4_PREPARE loads the round keys addressed by its argument.
 * Whether this encrypts or decrypts depends only on which round key array
 * the caller passes.
 */
.align 3
SYM_FUNC_START(sm4_ce_crypt_block)
	SM4_PREPARE(x0)

	ld1	{v0.16b}, [x2]
	SM4_CRYPT_BLK(v0)
	st1	{v0.16b}, [x1]

	ret
SYM_FUNC_END(sm4_ce_crypt_block)

/*
 * sm4_ce_crypt - process nblocks independent 16-byte blocks (ECB style).
 *
 * input:
 *   x0: round key array, CTX
 *   x1: dst
 *   x2: src
 *   w3: nblocks
 *
 * Blocks are handled eight at a time, then at most one group of four, then
 * one by one. x1 and x2 are advanced past the processed data.
 *
 * NOTE(review): nblocks == 0 is not checked. It would reach the single-block
 * loop, where the counter wraps below zero. Callers presumably guarantee
 * nblocks >= 1 -- confirm.
 */
.align 3
SYM_FUNC_START(sm4_ce_crypt)
	SM4_PREPARE(x0)

.Lcrypt_loop_blk:
	/* speculatively take 8 blocks; a negative result means fewer remain */
	sub	w3, w3, #8
	tbnz	w3, #31, .Lcrypt_tail8

	ld1	{v0.16b-v3.16b}, [x2], #64
	ld1	{v4.16b-v7.16b}, [x2], #64

	SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7)

	st1	{v0.16b-v3.16b}, [x1], #64
	st1	{v4.16b-v7.16b}, [x1], #64

	cbz	w3, .Lcrypt_end
	b	.Lcrypt_loop_blk

.Lcrypt_tail8:
	/* undo the speculative subtraction: w3 = blocks left (fewer than 8) */
	add	w3, w3, #8
	cmp	w3, #4
	b.lt	.Lcrypt_tail4

	sub	w3, w3, #4

	ld1	{v0.16b-v3.16b}, [x2], #64
	SM4_CRYPT_BLK4(v0, v1, v2, v3)
	st1	{v0.16b-v3.16b}, [x1], #64

	cbz	w3, .Lcrypt_end

.Lcrypt_tail4:
	/* fewer than 4 blocks left: one block per iteration */
	sub	w3, w3, #1

	ld1	{v0.16b}, [x2], #16
	SM4_CRYPT_BLK(v0)
	st1	{v0.16b}, [x1], #16

	cbnz	w3, .Lcrypt_tail4

.Lcrypt_end:
	ret
SYM_FUNC_END(sm4_ce_crypt)

/*
 * sm4_ce_cbc_enc - CBC encryption of nblocks 16-byte blocks.
 *
 * input:
 *   x0: round key array, CTX
 *   x1: dst
 *   x2: src
 *   x3: iv (big endian, 128 bit); updated with the last ciphertext block
 *   w4: nblocks
 *
 * Every block depends on the previous ciphertext, so the cipher is always
 * invoked on one block at a time; the 4x path only batches loads and stores.
 *
 * NOTE(review): nblocks == 0 is not checked and would make the counter wrap
 * in the single-block loop. Callers presumably pass nblocks >= 1 -- confirm.
 */
.align 3
SYM_FUNC_START(sm4_ce_cbc_enc)
	SM4_PREPARE(x0)

	ld1	{RIV.16b}, [x3]

.Lcbc_enc_loop_4x:
	cmp	w4, #4
	b.lt	.Lcbc_enc_loop_1x

	sub	w4, w4, #4

	ld1	{v0.16b-v3.16b}, [x2], #64

	/* chain: each plaintext is xored with the preceding ciphertext */
	eor	v0.16b, v0.16b, RIV.16b
	SM4_CRYPT_BLK(v0)
	eor	v1.16b, v1.16b, v0.16b
	SM4_CRYPT_BLK(v1)
	eor	v2.16b, v2.16b, v1.16b
	SM4_CRYPT_BLK(v2)
	eor	v3.16b, v3.16b, v2.16b
	SM4_CRYPT_BLK(v3)

	st1	{v0.16b-v3.16b}, [x1], #64
	/* the last ciphertext block is the chaining value for what follows */
	mov	RIV.16b, v3.16b

	cbz	w4, .Lcbc_enc_end
	b	.Lcbc_enc_loop_4x

.Lcbc_enc_loop_1x:
	sub	w4, w4, #1

	ld1	{v0.16b}, [x2], #16

	/* encrypt in place in RIV so it already holds the next chaining value */
	eor	RIV.16b, RIV.16b, v0.16b
	SM4_CRYPT_BLK(RIV)

	st1	{RIV.16b}, [x1], #16

	cbnz	w4, .Lcbc_enc_loop_1x

.Lcbc_enc_end:
	/* store new IV */
	st1	{RIV.16b}, [x3]

	ret
SYM_FUNC_END(sm4_ce_cbc_enc)

/*
 * sm4_ce_cbc_dec - CBC decryption of nblocks 16-byte blocks.
 *
 * input:
 *   x0: round key array, CTX
 *   x1: dst
 *   x2: src
 *   x3: iv (big endian, 128 bit); updated with the last ciphertext block
 *   w4: nblocks
 *
 * The ciphertext is kept untouched in v0-v7 because it is needed again as
 * the xor input of the following block; the cipher works on word-swapped
 * copies in v8-v15 via the *_BE macro variants (defined in sm4-ce-asm.h,
 * presumably expecting already byte-swapped input -- confirm).
 *
 * NOTE(review): v8-v15 are overwritten without being saved. AAPCS64 treats
 * their low halves as callee-saved; presumably this is safe because callers
 * run this code inside a kernel-mode NEON section -- confirm.
 * NOTE(review): nblocks == 0 is not checked and would make the counter wrap
 * in the single-block loop. Callers presumably pass nblocks >= 1 -- confirm.
 */
.align 3
SYM_FUNC_START(sm4_ce_cbc_dec)
	SM4_PREPARE(x0)

	ld1	{RIV.16b}, [x3]

.Lcbc_dec_loop_8x:
	/* speculatively take 8 blocks; a negative result means fewer remain */
	sub	w4, w4, #8
	tbnz	w4, #31, .Lcbc_dec_4x

	ld1	{v0.16b-v3.16b}, [x2], #64
	ld1	{v4.16b-v7.16b}, [x2], #64

	rev32	v8.16b, v0.16b
	rev32	v9.16b, v1.16b
	rev32	v10.16b, v2.16b
	rev32	v11.16b, v3.16b
	rev32	v12.16b, v4.16b
	rev32	v13.16b, v5.16b
	rev32	v14.16b, v6.16b
	rev32	v15.16b, v7.16b

	SM4_CRYPT_BLK8_BE(v8, v9, v10, v11, v12, v13, v14, v15)

	/* plaintext i = cipher output i ^ ciphertext i-1 (IV for the first) */
	eor	v8.16b, v8.16b, RIV.16b
	eor	v9.16b, v9.16b, v0.16b
	eor	v10.16b, v10.16b, v1.16b
	eor	v11.16b, v11.16b, v2.16b
	eor	v12.16b, v12.16b, v3.16b
	eor	v13.16b, v13.16b, v4.16b
	eor	v14.16b, v14.16b, v5.16b
	eor	v15.16b, v15.16b, v6.16b

	st1	{v8.16b-v11.16b}, [x1], #64
	st1	{v12.16b-v15.16b}, [x1], #64

	/* last ciphertext block chains into the next batch */
	mov	RIV.16b, v7.16b

	cbz	w4, .Lcbc_dec_end
	b	.Lcbc_dec_loop_8x

.Lcbc_dec_4x:
	/* undo the speculative subtraction: w4 = blocks left (fewer than 8) */
	add	w4, w4, #8
	cmp	w4, #4
	b.lt	.Lcbc_dec_loop_1x

	sub	w4, w4, #4

	ld1	{v0.16b-v3.16b}, [x2], #64

	rev32	v8.16b, v0.16b
	rev32	v9.16b, v1.16b
	rev32	v10.16b, v2.16b
	rev32	v11.16b, v3.16b

	SM4_CRYPT_BLK4_BE(v8, v9, v10, v11)

	eor	v8.16b, v8.16b, RIV.16b
	eor	v9.16b, v9.16b, v0.16b
	eor	v10.16b, v10.16b, v1.16b
	eor	v11.16b, v11.16b, v2.16b

	st1	{v8.16b-v11.16b}, [x1], #64

	mov	RIV.16b, v3.16b

	cbz	w4, .Lcbc_dec_end

.Lcbc_dec_loop_1x:
	sub	w4, w4, #1

	ld1	{v0.16b}, [x2], #16

	rev32	v8.16b, v0.16b

	SM4_CRYPT_BLK_BE(v8)

	eor	v8.16b, v8.16b, RIV.16b
	st1	{v8.16b}, [x1], #16

	/* this ciphertext block is the chaining value for the next one */
	mov	RIV.16b, v0.16b

	cbnz	w4, .Lcbc_dec_loop_1x

.Lcbc_dec_end:
	/* store new IV */
	st1	{RIV.16b}, [x3]

	ret
SYM_FUNC_END(sm4_ce_cbc_dec)

/*
 * sm4_ce_cbc_cts_enc - CBC ciphertext stealing, encryption of the final
 * full block plus the trailing block of Ln = nbytes - 16 bytes.
 *
 * input:
 *   x0: round key array, CTX
 *   x1: dst
 *   x2: src
 *   x3: iv (big endian, 128 bit); read only, not written back
 *   w4: nbytes
 *
 * Exactly two 16-byte loads and two 16-byte stores are issued, at offsets 0
 * and Ln, so nbytes is presumably expected to be in 17..32 -- confirm with
 * the caller.
 */
.align 3
SYM_FUNC_START(sm4_ce_cbc_cts_enc)
	SM4_PREPARE(x0)

	/* x5 = Ln, the length of the trailing block */
	sub	w5, w4, #16
	uxtw	x5, w5

	ld1	{RIV.16b}, [x3]

	/* En-1 = E(IV ^ Pn-1), kept in RIV */
	ld1	{v0.16b}, [x2]
	eor	RIV.16b, RIV.16b, v0.16b
	SM4_CRYPT_BLK(RIV)

	/*
	 * Two 16-byte windows into the permute table:
	 *   v3 (table + Ln):      16-Ln bytes of 0xff, then indices 0..Ln-1
	 *   v4 (table + 32 - Ln): indices 16-Ln..15, then 16-Ln bytes of 0xff
	 */
	adr_l	x6, .Lcts_permute_table
	add	x7, x6, #32
	add	x6, x6, x5
	sub	x7, x7, x5
	ld1	{v3.16b}, [x6]
	ld1	{v4.16b}, [x7]

	/* overlapping load: the last 16 bytes of src, ending with Pn */
	add	x2, x2, x5
	ld1	{v1.16b}, [x2]

	/* Cn = first Ln bytes of En-1, moved to the end of v0 (rest zero) */
	tbl	v0.16b, {RIV.16b}, v3.16b
	/* Pn moved to the front of v1; 0xff indices make tbl pad with zeros */
	tbl	v1.16b, {v1.16b}, v4.16b

	eor	v1.16b, v1.16b, RIV.16b
	SM4_CRYPT_BLK(v1)

	/*
	 * Overlapping stores: v0 goes to dst + Ln first, then the full block
	 * at dst overwrites the zero bytes at the front of v0.
	 */
	add	x5, x1, x5
	st1	{v0.16b}, [x5]
	st1	{v1.16b}, [x1]

	ret
SYM_FUNC_END(sm4_ce_cbc_cts_enc)

/*
 * sm4_ce_cbc_cts_dec - CBC ciphertext stealing, decryption of the final
 * full block plus the trailing block of Ln = nbytes - 16 bytes.
 *
 * input:
 *   x0: round key array, CTX
 *   x1: dst
 *   x2: src
 *   x3: iv (big endian, 128 bit); read only, not written back
 *   w4: nbytes
 *
 * Exactly two 16-byte loads and two 16-byte stores are issued, at offsets 0
 * and Ln, so nbytes is presumably expected to be in 17..32 -- confirm with
 * the caller.
 */
.align 3
SYM_FUNC_START(sm4_ce_cbc_cts_dec)
	SM4_PREPARE(x0)

	/* x5 = Ln, the length of the trailing block */
	sub	w5, w4, #16
	uxtw	x5, w5

	ld1	{RIV.16b}, [x3]

	/*
	 * Two 16-byte windows into the permute table:
	 *   v3 (table + Ln):      16-Ln bytes of 0xff, then indices 0..Ln-1
	 *   v4 (table + 32 - Ln): indices 16-Ln..15, then 16-Ln bytes of 0xff
	 */
	adr_l	x6, .Lcts_permute_table
	add	x7, x6, #32
	add	x6, x6, x5
	sub	x7, x7, x5
	ld1	{v3.16b}, [x6]
	ld1	{v4.16b}, [x7]

	/* overlapping loads: v0 = first 16 bytes, v1 = last 16 bytes of src */
	ld1	{v0.16b}, [x2], x5
	ld1	{v1.16b}, [x2]

	SM4_CRYPT_BLK(v0)
	/* select the first Ln bytes of Xn to create Pn */
	tbl	v2.16b, {v0.16b}, v3.16b
	eor	v2.16b, v2.16b, v1.16b

	/*
	 * overwrite the first Ln bytes with Cn to create En-1; tbx leaves the
	 * bytes selected by 0xff indices unchanged
	 */
	tbx	v0.16b, {v1.16b}, v4.16b
	SM4_CRYPT_BLK(v0)
	eor	v0.16b, v0.16b, RIV.16b

	/*
	 * Overlapping stores: v2 goes to dst + Ln first, then the full block
	 * at dst overwrites the leading bytes of v2 that are not part of Pn.
	 */
	add	x5, x1, x5
	st1	{v2.16b}, [x5]
	st1	{v0.16b}, [x1]

	ret
SYM_FUNC_END(sm4_ce_cbc_cts_dec)

/*
 * inc_le128(vctr): materialise the current 128-bit counter, held in x7
 * (upper 64 bits) and x8 (lower 64 bits), into vctr and then advance x7:x8
 * by one. rev64 byte-reverses each 64-bit lane so that vctr ends up in the
 * same byte order the counter had in memory. rev64 does not touch the
 * flags, so the carry produced by adds is still valid for adc.
 */
#define inc_le128(vctr)				\
		mov	vctr.d[1], x8;		\
		mov	vctr.d[0], x7;		\
		adds	x8, x8, #1;		\
		rev64	vctr.16b, vctr.16b;	\
		adc	x7, x7, xzr;

/*
 * sm4_ce_ctr_enc - CTR mode over nblocks 16-byte blocks.
 *
 * input:
 *   x0: round key array, CTX
 *   x1: dst
 *   x2: src
 *   x3: ctr (big endian, 128 bit); updated with the next unused counter
 *   w4: nblocks
 *
 * NOTE(review): v8-v15 are overwritten without being saved. AAPCS64 treats
 * their low halves as callee-saved; presumably this is safe because callers
 * run this code inside a kernel-mode NEON section -- confirm.
 * NOTE(review): nblocks == 0 is not checked and would make the block counter
 * wrap in the single-block loop. Callers presumably pass nblocks >= 1 --
 * confirm.
 */
.align 3
SYM_FUNC_START(sm4_ce_ctr_enc)
	SM4_PREPARE(x0)

	/* byte-swap both halves of the counter so integer adds can be used */
	ldp	x7, x8, [x3]
	rev	x7, x7
	rev	x8, x8

.Lctr_loop_8x:
	/* speculatively take 8 blocks; a negative result means fewer remain */
	sub	w4, w4, #8
	tbnz	w4, #31, .Lctr_4x

	/* construct CTRs */
	inc_le128(v0)			/* +0 */
	inc_le128(v1)			/* +1 */
	inc_le128(v2)			/* +2 */
	inc_le128(v3)			/* +3 */
	inc_le128(v4)			/* +4 */
	inc_le128(v5)			/* +5 */
	inc_le128(v6)			/* +6 */
	inc_le128(v7)			/* +7 */

	ld1	{v8.16b-v11.16b}, [x2], #64
	ld1	{v12.16b-v15.16b}, [x2], #64

	/* keystream = cipher(counter) */
	SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7)

	eor	v0.16b, v0.16b, v8.16b
	eor	v1.16b, v1.16b, v9.16b
	eor	v2.16b, v2.16b, v10.16b
	eor	v3.16b, v3.16b, v11.16b
	eor	v4.16b, v4.16b, v12.16b
	eor	v5.16b, v5.16b, v13.16b
	eor	v6.16b, v6.16b, v14.16b
	eor	v7.16b, v7.16b, v15.16b

	st1	{v0.16b-v3.16b}, [x1], #64
	st1	{v4.16b-v7.16b}, [x1], #64

	cbz	w4, .Lctr_end
	b	.Lctr_loop_8x

.Lctr_4x:
	/* undo the speculative subtraction: w4 = blocks left (fewer than 8) */
	add	w4, w4, #8
	cmp	w4, #4
	b.lt	.Lctr_loop_1x

	sub	w4, w4, #4

	/* construct CTRs */
	inc_le128(v0)			/* +0 */
	inc_le128(v1)			/* +1 */
	inc_le128(v2)			/* +2 */
	inc_le128(v3)			/* +3 */

	ld1	{v8.16b-v11.16b}, [x2], #64

	SM4_CRYPT_BLK4(v0, v1, v2, v3)

	eor	v0.16b, v0.16b, v8.16b
	eor	v1.16b, v1.16b, v9.16b
	eor	v2.16b, v2.16b, v10.16b
	eor	v3.16b, v3.16b, v11.16b

	st1	{v0.16b-v3.16b}, [x1], #64

	cbz	w4, .Lctr_end

.Lctr_loop_1x:
	sub	w4, w4, #1

	/* construct CTRs */
	inc_le128(v0)

	ld1	{v8.16b}, [x2], #16

	SM4_CRYPT_BLK(v0)

	eor	v0.16b, v0.16b, v8.16b
	st1	{v0.16b}, [x1], #16

	cbnz	w4, .Lctr_loop_1x

.Lctr_end:
	/* store new CTR, converted back to its in-memory byte order */
	rev	x7, x7
	rev	x8, x8
	stp	x7, x8, [x3]

	ret
SYM_FUNC_END(sm4_ce_ctr_enc)

/*
 * tweak_next(vt, vin, RTMP): vt = vin multiplied by x in GF(2^128), i.e. the
 * next XTS tweak. RTMP is a scratch vector; RMASK must hold 1 in its low
 * 64-bit lane and 0x87 in its high lane (see the XTS functions).
 *
 *   sshr/and: per lane, RMASK lane value if the lane top bit is set, else 0
 *   add:      shift each 64-bit lane left by one bit
 *   ext:      swap the two lanes of the mask, so the low lane carry (1)
 *             lands in the high lane and the high lane overflow folds back
 *             into the low lane as 0x87
 *
 * vt and vin may be the same register: vin is last read by the add.
 */
#define tweak_next(vt, vin, RTMP)					\
		sshr	RTMP.2d, vin.2d, #63;			\
		and	RTMP.16b, RTMP.16b, RMASK.16b;		\
		add	vt.2d, vin.2d, vin.2d;			\
		ext	RTMP.16b, RTMP.16b, RTMP.16b, #8;	\
		eor	vt.16b, vt.16b, RTMP.16b;

/*
 * sm4_ce_xts_enc - XTS encryption with ciphertext stealing.
 *
 * input:
 *   x0: round key array, CTX
 *   x1: dst
 *   x2: src
 *   x3: tweak (big endian, 128 bit)
 *   w4: nbytes
 *   x5: round key array for IV; NULL means the tweak at x3 is used as is
 *
 * When nbytes is a multiple of 16 the next tweak is written back to x3.
 * When there is a partial trailing block, the last full block and the
 * tail are handled by ciphertext stealing and x3 is NOT updated.
 *
 * NOTE(review): v8-v15 are overwritten without being saved. AAPCS64 treats
 * their low halves as callee-saved; presumably this is safe because callers
 * run this code inside a kernel-mode NEON section -- confirm.
 */
.align 3
SYM_FUNC_START(sm4_ce_xts_enc)
	ld1	{v8.16b}, [x3]

	cbz	x5, .Lxts_enc_nofirst

	SM4_PREPARE(x5)

	/* Generate first tweak */
	SM4_CRYPT_BLK(v8)

.Lxts_enc_nofirst:
	SM4_PREPARE(x0)

	/*
	 * x5 is free from here on and becomes the tail length (nbytes % 16).
	 * w4 becomes the number of full blocks for the bulk loops; with a
	 * tail present, one full block is held back for ciphertext stealing.
	 */
	ands	w5, w4, #15
	lsr	w4, w4, #4
	sub	w6, w4, #1
	csel	w4, w4, w6, eq
	uxtw	x5, w5

	/* RMASK = { 1, 0, 0x87, 0 } as 32-bit words: lanes 1 and 0x87 */
	movi	RMASK.2s, #0x1
	movi	RTMP0.2s, #0x87
	uzp1	RMASK.4s, RMASK.4s, RTMP0.4s

	cbz	w4, .Lxts_enc_cts

.Lxts_enc_loop_8x:
	/* speculatively take 8 blocks; a negative result means fewer remain */
	sub	w4, w4, #8
	tbnz	w4, #31, .Lxts_enc_4x

	/* tweaks for blocks 1..7; block 0 uses v8 */
	tweak_next( v9,  v8, RTMP0)
	tweak_next(v10,  v9, RTMP1)
	tweak_next(v11, v10, RTMP2)
	tweak_next(v12, v11, RTMP3)
	tweak_next(v13, v12, RTMP0)
	tweak_next(v14, v13, RTMP1)
	tweak_next(v15, v14, RTMP2)

	ld1	{v0.16b-v3.16b}, [x2], #64
	ld1	{v4.16b-v7.16b}, [x2], #64
	eor	v0.16b, v0.16b,  v8.16b
	eor	v1.16b, v1.16b,  v9.16b
	eor	v2.16b, v2.16b, v10.16b
	eor	v3.16b, v3.16b, v11.16b
	eor	v4.16b, v4.16b, v12.16b
	eor	v5.16b, v5.16b, v13.16b
	eor	v6.16b, v6.16b, v14.16b
	eor	v7.16b, v7.16b, v15.16b

	SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7)

	/* the same tweak is xored in again after the cipher */
	eor	v0.16b, v0.16b,  v8.16b
	eor	v1.16b, v1.16b,  v9.16b
	eor	v2.16b, v2.16b, v10.16b
	eor	v3.16b, v3.16b, v11.16b
	eor	v4.16b, v4.16b, v12.16b
	eor	v5.16b, v5.16b, v13.16b
	eor	v6.16b, v6.16b, v14.16b
	eor	v7.16b, v7.16b, v15.16b
	st1	{v0.16b-v3.16b}, [x1], #64
	st1	{v4.16b-v7.16b}, [x1], #64

	/* v8 = tweak for the next unprocessed block */
	tweak_next(v8, v15, RTMP3)

	cbz	w4, .Lxts_enc_cts
	b	.Lxts_enc_loop_8x

.Lxts_enc_4x:
	/* undo the speculative subtraction: w4 = blocks left (fewer than 8) */
	add	w4, w4, #8
	cmp	w4, #4
	b.lt	.Lxts_enc_loop_1x

	sub	w4, w4, #4

	tweak_next( v9,  v8, RTMP0)
	tweak_next(v10,  v9, RTMP1)
	tweak_next(v11, v10, RTMP2)

	ld1	{v0.16b-v3.16b}, [x2], #64
	eor	v0.16b, v0.16b,  v8.16b
	eor	v1.16b, v1.16b,  v9.16b
	eor	v2.16b, v2.16b, v10.16b
	eor	v3.16b, v3.16b, v11.16b

	SM4_CRYPT_BLK4(v0, v1, v2, v3)

	eor	v0.16b, v0.16b,  v8.16b
	eor	v1.16b, v1.16b,  v9.16b
	eor	v2.16b, v2.16b, v10.16b
	eor	v3.16b, v3.16b, v11.16b
	st1	{v0.16b-v3.16b}, [x1], #64

	tweak_next(v8, v11, RTMP3)

	cbz	w4, .Lxts_enc_cts

.Lxts_enc_loop_1x:
	sub	w4, w4, #1

	ld1	{v0.16b}, [x2], #16
	eor	v0.16b, v0.16b, v8.16b

	SM4_CRYPT_BLK(v0)

	eor	v0.16b, v0.16b, v8.16b
	st1	{v0.16b}, [x1], #16

	tweak_next(v8, v8, RTMP0)

	cbnz	w4, .Lxts_enc_loop_1x

.Lxts_enc_cts:
	/* no tail bytes: just save the tweak and return */
	cbz	x5, .Lxts_enc_end

	/* cipher text stealing */

	/* last full block is encrypted with v8, the stolen block with v9 */
	tweak_next(v9, v8, RTMP0)
	ld1	{v0.16b}, [x2]
	eor	v0.16b, v0.16b, v8.16b
	SM4_CRYPT_BLK(v0)
	eor	v0.16b, v0.16b, v8.16b

	/*
	 * Two 16-byte windows into the permute table (Ln = tail length):
	 *   v3 (table + Ln):      16-Ln bytes of 0xff, then indices 0..Ln-1
	 *   v4 (table + 32 - Ln): indices 16-Ln..15, then 16-Ln bytes of 0xff
	 */
	adr_l	x6, .Lcts_permute_table
	add	x7, x6, #32
	add	x6, x6, x5
	sub	x7, x7, x5
	ld1	{v3.16b}, [x6]
	ld1	{v4.16b}, [x7]

	/* overlapping load: 16 bytes ending with the Ln tail bytes */
	add	x2, x2, x5
	ld1	{v1.16b}, [x2]

	/* create Cn from En-1: its first Ln bytes, moved to the end of v2 */
	tbl	v2.16b, {v0.16b}, v3.16b
	/* padding Pn with En-1 at the end: tbx keeps bytes at 0xff indices */
	tbx	v0.16b, {v1.16b}, v4.16b

	eor	v0.16b, v0.16b, v9.16b
	SM4_CRYPT_BLK(v0)
	eor	v0.16b, v0.16b, v9.16b

	/*
	 * Overlapping stores: v2 goes to dst + Ln first, then the full block
	 * at dst overwrites the leading bytes of v2 that are not part of Cn.
	 */
	add	x5, x1, x5
	st1	{v2.16b}, [x5]
	st1	{v0.16b}, [x1]

	b	.Lxts_enc_ret

.Lxts_enc_end:
	/* store new tweak */
	st1	{v8.16b}, [x3]

.Lxts_enc_ret:
	ret
SYM_FUNC_END(sm4_ce_xts_enc)

/*
 * sm4_ce_xts_dec - XTS decryption with ciphertext stealing.
 *
 * input:
 *   x0: round key array, CTX
 *   x1: dst
 *   x2: src
 *   x3: tweak (big endian, 128 bit)
 *   w4: nbytes
 *   x5: round key array for IV; NULL means the tweak at x3 is used as is
 *
 * When nbytes is a multiple of 16 the next tweak is written back to x3.
 * When there is a partial trailing block, the last full block and the
 * tail are handled by ciphertext stealing and x3 is NOT updated.
 *
 * The bulk loops are the same as in sm4_ce_xts_enc; only the order in which
 * the two tweaks are applied in the stealing path differs.
 *
 * NOTE(review): v8-v15 are overwritten without being saved. AAPCS64 treats
 * their low halves as callee-saved; presumably this is safe because callers
 * run this code inside a kernel-mode NEON section -- confirm.
 */
.align 3
SYM_FUNC_START(sm4_ce_xts_dec)
	ld1	{v8.16b}, [x3]

	cbz	x5, .Lxts_dec_nofirst

	SM4_PREPARE(x5)

	/* Generate first tweak */
	SM4_CRYPT_BLK(v8)

.Lxts_dec_nofirst:
	SM4_PREPARE(x0)

	/*
	 * x5 is free from here on and becomes the tail length (nbytes % 16).
	 * w4 becomes the number of full blocks for the bulk loops; with a
	 * tail present, one full block is held back for ciphertext stealing.
	 */
	ands	w5, w4, #15
	lsr	w4, w4, #4
	sub	w6, w4, #1
	csel	w4, w4, w6, eq
	uxtw	x5, w5

	/* RMASK = { 1, 0, 0x87, 0 } as 32-bit words: lanes 1 and 0x87 */
	movi	RMASK.2s, #0x1
	movi	RTMP0.2s, #0x87
	uzp1	RMASK.4s, RMASK.4s, RTMP0.4s

	cbz	w4, .Lxts_dec_cts

.Lxts_dec_loop_8x:
	/* speculatively take 8 blocks; a negative result means fewer remain */
	sub	w4, w4, #8
	tbnz	w4, #31, .Lxts_dec_4x

	/* tweaks for blocks 1..7; block 0 uses v8 */
	tweak_next( v9,  v8, RTMP0)
	tweak_next(v10,  v9, RTMP1)
	tweak_next(v11, v10, RTMP2)
	tweak_next(v12, v11, RTMP3)
	tweak_next(v13, v12, RTMP0)
	tweak_next(v14, v13, RTMP1)
	tweak_next(v15, v14, RTMP2)

	ld1	{v0.16b-v3.16b}, [x2], #64
	ld1	{v4.16b-v7.16b}, [x2], #64
	eor	v0.16b, v0.16b,  v8.16b
	eor	v1.16b, v1.16b,  v9.16b
	eor	v2.16b, v2.16b, v10.16b
	eor	v3.16b, v3.16b, v11.16b
	eor	v4.16b, v4.16b, v12.16b
	eor	v5.16b, v5.16b, v13.16b
	eor	v6.16b, v6.16b, v14.16b
	eor	v7.16b, v7.16b, v15.16b

	SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7)

	/* the same tweak is xored in again after the cipher */
	eor	v0.16b, v0.16b,  v8.16b
	eor	v1.16b, v1.16b,  v9.16b
	eor	v2.16b, v2.16b, v10.16b
	eor	v3.16b, v3.16b, v11.16b
	eor	v4.16b, v4.16b, v12.16b
	eor	v5.16b, v5.16b, v13.16b
	eor	v6.16b, v6.16b, v14.16b
	eor	v7.16b, v7.16b, v15.16b
	st1	{v0.16b-v3.16b}, [x1], #64
	st1	{v4.16b-v7.16b}, [x1], #64

	/* v8 = tweak for the next unprocessed block */
	tweak_next(v8, v15, RTMP3)

	cbz	w4, .Lxts_dec_cts
	b	.Lxts_dec_loop_8x

.Lxts_dec_4x:
	/* undo the speculative subtraction: w4 = blocks left (fewer than 8) */
	add	w4, w4, #8
	cmp	w4, #4
	b.lt	.Lxts_dec_loop_1x

	sub	w4, w4, #4

	tweak_next( v9,  v8, RTMP0)
	tweak_next(v10,  v9, RTMP1)
	tweak_next(v11, v10, RTMP2)

	ld1	{v0.16b-v3.16b}, [x2], #64
	eor	v0.16b, v0.16b,  v8.16b
	eor	v1.16b, v1.16b,  v9.16b
	eor	v2.16b, v2.16b, v10.16b
	eor	v3.16b, v3.16b, v11.16b

	SM4_CRYPT_BLK4(v0, v1, v2, v3)

	eor	v0.16b, v0.16b,  v8.16b
	eor	v1.16b, v1.16b,  v9.16b
	eor	v2.16b, v2.16b, v10.16b
	eor	v3.16b, v3.16b, v11.16b
	st1	{v0.16b-v3.16b}, [x1], #64

	tweak_next(v8, v11, RTMP3)

	cbz	w4, .Lxts_dec_cts

.Lxts_dec_loop_1x:
	sub	w4, w4, #1

	ld1	{v0.16b}, [x2], #16
	eor	v0.16b, v0.16b, v8.16b

	SM4_CRYPT_BLK(v0)

	eor	v0.16b, v0.16b, v8.16b
	st1	{v0.16b}, [x1], #16

	tweak_next(v8, v8, RTMP0)

	cbnz	w4, .Lxts_dec_loop_1x

.Lxts_dec_cts:
	/* no tail bytes: just save the tweak and return */
	cbz	x5, .Lxts_dec_end

	/* cipher text stealing */

	/*
	 * Reverse of the encryption path: the last full ciphertext block is
	 * processed with the later tweak (v9), the reassembled block with v8.
	 */
	tweak_next(v9, v8, RTMP0)
	ld1	{v0.16b}, [x2]
	eor	v0.16b, v0.16b, v9.16b
	SM4_CRYPT_BLK(v0)
	eor	v0.16b, v0.16b, v9.16b

	/*
	 * Two 16-byte windows into the permute table (Ln = tail length):
	 *   v3 (table + Ln):      16-Ln bytes of 0xff, then indices 0..Ln-1
	 *   v4 (table + 32 - Ln): indices 16-Ln..15, then 16-Ln bytes of 0xff
	 */
	adr_l	x6, .Lcts_permute_table
	add	x7, x6, #32
	add	x6, x6, x5
	sub	x7, x7, x5
	ld1	{v3.16b}, [x6]
	ld1	{v4.16b}, [x7]

	/* overlapping load: 16 bytes ending with the Ln tail bytes */
	add	x2, x2, x5
	ld1	{v1.16b}, [x2]

	/* first Ln bytes of the block just processed, moved to the end of v2 */
	tbl	v2.16b, {v0.16b}, v3.16b
	/* tail bytes replace the first Ln bytes; tbx keeps the remainder */
	tbx	v0.16b, {v1.16b}, v4.16b

	eor	v0.16b, v0.16b, v8.16b
	SM4_CRYPT_BLK(v0)
	eor	v0.16b, v0.16b, v8.16b

	/*
	 * Overlapping stores: v2 goes to dst + Ln first, then the full block
	 * at dst overwrites the leading bytes of v2 that are not tail output.
	 */
	add	x5, x1, x5
	st1	{v2.16b}, [x5]
	st1	{v0.16b}, [x1]

	b	.Lxts_dec_ret

.Lxts_dec_end:
	/* store new tweak */
	st1	{v8.16b}, [x3]

.Lxts_dec_ret:
	ret
SYM_FUNC_END(sm4_ce_xts_dec)

/*
 * sm4_ce_mac_update - absorb nblocks 16-byte blocks into a CBC-MAC style
 * running digest.
 *
 * input:
 *   x0: round key array, CTX
 *   x1: digest (16 bytes, read and written back)
 *   x2: src
 *   w3: nblocks
 *   w4: enc_before - if nonzero, the digest is run through the cipher
 *                    before any data is absorbed
 *   w5: enc_after  - if nonzero, every block is xored in and encrypted;
 *                    if zero, the final block is only xored in and its
 *                    encryption is left pending
 *
 * nblocks == 0 is handled: the digest (after the optional enc_before step)
 * is simply written back.
 */
.align 3
SYM_FUNC_START(sm4_ce_mac_update)
	SM4_PREPARE(x0)

	ld1	{RMAC.16b}, [x1]

	cbz	w4, .Lmac_update

	SM4_CRYPT_BLK(RMAC)

.Lmac_update:
	cbz	w3, .Lmac_ret

	/* w3 = blocks to xor AND encrypt: all of them, or all but the last */
	sub	w6, w3, #1
	cmp	w5, wzr
	csel	w3, w3, w6, ne

	cbz	w3, .Lmac_end

.Lmac_loop_4x:
	cmp	w3, #4
	b.lt	.Lmac_loop_1x

	sub	w3, w3, #4

	ld1	{v0.16b-v3.16b}, [x2], #64

	/* the chain is serial; only the load is batched */
	eor	RMAC.16b, RMAC.16b, v0.16b
	SM4_CRYPT_BLK(RMAC)
	eor	RMAC.16b, RMAC.16b, v1.16b
	SM4_CRYPT_BLK(RMAC)
	eor	RMAC.16b, RMAC.16b, v2.16b
	SM4_CRYPT_BLK(RMAC)
	eor	RMAC.16b, RMAC.16b, v3.16b
	SM4_CRYPT_BLK(RMAC)

	cbz	w3, .Lmac_end
	b	.Lmac_loop_4x

.Lmac_loop_1x:
	sub	w3, w3, #1

	ld1	{v0.16b}, [x2], #16

	eor	RMAC.16b, RMAC.16b, v0.16b
	SM4_CRYPT_BLK(RMAC)

	cbnz	w3, .Lmac_loop_1x

.Lmac_end:
	cbnz	w5, .Lmac_ret

	/* enc_after == 0: fold in the held-back last block without encrypting */
	ld1	{v0.16b}, [x2], #16
	eor	RMAC.16b, RMAC.16b, v0.16b

.Lmac_ret:
	st1	{RMAC.16b}, [x1]
	ret
SYM_FUNC_END(sm4_ce_mac_update)

.section ".rodata", "a"
.align 4
.Lbswap128_mask:
.byte  0x0c, 0x0d, 0x0e, 0x0f, 0x08, 0x09, 0x0a, 0x0b
.byte  0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03

.Lcts_permute_table:
.byte  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
.byte  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
.byte   0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
.byte   0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf
.byte  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
.byte  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff

Messung V0.5

¤ Dauer der Verarbeitung: 0.8 Sekunden ¤

Wurzel

Suchen

Beweissystem der NASA

Beweissystem Isabelle

NIST Cobol Testsuite

Cephes Mathematical Library

Wiener Entwicklungsmethode

Haftungshinweis

Die Informationen auf dieser Webseite wurden nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit, noch Qualität der bereitgestellten Informationen zugesichert.

Bemerkung:

Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.