// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Support for AES-NI and VAES instructions.  This file contains glue code.
 * The real AES implementations are in aesni-intel_asm.S and other .S files.
 *
 * Copyright (C) 2008, Intel Corp.
 *    Author: Huang Ying <ying.huang@intel.com>
 *
 * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
 * interface for 64-bit kernels.
 *    Authors: Adrian Hoban <adrian.hoban@intel.com>
 *             Gabriele Paoloni <gabriele.paoloni@intel.com>
 *             Tadeusz Struk (tadeusz.struk@intel.com)
 *             Aidan O'Mahony (aidan.o.mahony@intel.com)
 *    Copyright (c) 2010, Intel Corporation.
 *
 * Copyright 2024 Google LLC
 */
/* This handles cases where the source and/or destination span pages. */
static noinline int
xts_crypt_slowpath(struct skcipher_request *req, xts_crypt_func crypt_func)
{
        struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
        const struct aesni_xts_ctx *ctx = aes_xts_ctx(tfm);
        int tail = req->cryptlen % AES_BLOCK_SIZE;
        struct scatterlist sg_src[2], sg_dst[2];
        struct skcipher_request subreq;
        struct skcipher_walk walk;
        struct scatterlist *src, *dst;
        int err;

        /*
         * If the message length isn't divisible by the AES block size, then
         * separate off the last full block and the partial block.  This
         * ensures that they are processed in the same call to the assembly
         * function, which is required for ciphertext stealing.
         */
        if (tail) {
                skcipher_request_set_tfm(&subreq, tfm);
                skcipher_request_set_callback(&subreq,
                                              skcipher_request_flags(req),
                                              NULL, NULL);
                skcipher_request_set_crypt(&subreq, req->src, req->dst,
                                           req->cryptlen - tail - AES_BLOCK_SIZE,
                                           req->iv);
                req = &subreq;
        }
/* * In practice, virtually all XTS plaintexts and ciphertexts are either * 512 or 4096 bytes and do not use multiple scatterlist elements. To * optimize the performance of these cases, the below fast-path handles * single-scatterlist-element messages as efficiently as possible. The * code is 64-bit specific, as it assumes no page mapping is needed.
*/ if (IS_ENABLED(CONFIG_X86_64) &&
likely(req->src->length >= req->cryptlen &&
req->dst->length >= req->cryptlen)) {
(*crypt_func)(&ctx->crypt_ctx, sg_virt(req->src),
sg_virt(req->dst), req->cryptlen, req->iv);
kernel_fpu_end(); return 0;
}
kernel_fpu_end(); return xts_crypt_slowpath(req, crypt_func);
}
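/*
 * Illustrative sketch, separate from the driver code: how the lengths work
 * out for ciphertext stealing.  For a hypothetical 517-byte request, the bulk
 * pass covers 496 bytes, and the last full block plus the 5-byte partial
 * block (21 bytes) are handed to a single assembly call so the two blocks can
 * exchange data.  Compiles standalone as plain C; all values are made up.
 */
#include <stdio.h>

#define EX_AES_BLOCK_SIZE 16

int main(void)
{
        unsigned int cryptlen = 517;
        unsigned int tail = cryptlen % EX_AES_BLOCK_SIZE;         /* 5 */
        unsigned int bulk = cryptlen - tail - EX_AES_BLOCK_SIZE;  /* 496 */

        printf("bulk=%u bytes, cts unit=%u bytes\n",
               bulk, EX_AES_BLOCK_SIZE + tail);
        return 0;
}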
        while ((nbytes = walk.nbytes) != 0) {
                if (nbytes < walk.total) {
                        /* Not the end yet, so keep the length block-aligned. */
                        nbytes = round_down(nbytes, AES_BLOCK_SIZE);
                        nblocks = nbytes / AES_BLOCK_SIZE;
                } else {
                        /* It's the end, so include any final partial block. */
                        nblocks = DIV_ROUND_UP(nbytes, AES_BLOCK_SIZE);
                }
                ctr64 += nblocks;

                kernel_fpu_begin();
                if (likely(ctr64 >= nblocks)) {
                        /* The low 64 bits of the counter won't overflow. */
                        (*ctr64_func)(key, walk.src.virt.addr,
                                      walk.dst.virt.addr, nbytes, le_ctr);
                } else {
                        /*
                         * The low 64 bits of the counter will overflow.  The
                         * assembly doesn't handle this case, so split the
                         * operation into two at the point where the overflow
                         * will occur.  After the first part, add the carry bit.
                         */
                        p1_nbytes = min_t(unsigned int, nbytes,
                                          (nblocks - ctr64) * AES_BLOCK_SIZE);
                        (*ctr64_func)(key, walk.src.virt.addr,
                                      walk.dst.virt.addr, p1_nbytes, le_ctr);
                        le_ctr[0] = 0;
                        le_ctr[1]++;
                        (*ctr64_func)(key, walk.src.virt.addr + p1_nbytes,
                                      walk.dst.virt.addr + p1_nbytes,
                                      nbytes - p1_nbytes, le_ctr);
                }
                kernel_fpu_end();
                le_ctr[0] = ctr64;

                err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
        }
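/*
 * Illustrative sketch, separate from the driver code: the split arithmetic
 * used above when the low 64 bits of the counter wrap.  Values are made up;
 * the driver additionally clamps the first part with min_t().  Compiles
 * standalone as plain C.
 */
#include <stdint.h>
#include <stdio.h>

#define EX_AES_BLOCK_SIZE 16

int main(void)
{
        uint64_t ctr64 = UINT64_MAX - 1;  /* 2 blocks away from wrapping */
        unsigned int nbytes = 4 * EX_AES_BLOCK_SIZE;
        unsigned int nblocks = nbytes / EX_AES_BLOCK_SIZE;  /* 4 */
        unsigned int p1_nbytes;

        ctr64 += nblocks;               /* wraps around to 2 */
        if (ctr64 < nblocks) {
                /*
                 * Modulo 2^64, (nblocks - ctr64) is the number of blocks that
                 * fit before the wrap: 4 - 2 = 2 blocks (32 bytes) go in the
                 * first call, then the high half of the counter is
                 * incremented and the remaining 2 blocks go in the second.
                 */
                p1_nbytes = (unsigned int)((nblocks - ctr64) *
                                           EX_AES_BLOCK_SIZE);
                printf("part1=%u bytes, part2=%u bytes\n",
                       p1_nbytes, nbytes - p1_nbytes);
        }
        return 0;
}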
/* The common part of the x86_64 AES-GCM key struct */
struct aes_gcm_key {
        /* Expanded AES key and the AES key length in bytes */
        struct crypto_aes_ctx aes_key;

        /* RFC4106 nonce (used only by the rfc4106 algorithms) */
        u32 rfc4106_nonce;
};
/* Key struct used by the AES-NI implementations of AES-GCM */
struct aes_gcm_key_aesni {
        /*
         * Common part of the key.  The assembly code requires 16-byte
         * alignment for the round keys; we get this by them being located at
         * the start of the struct and the whole struct being 16-byte aligned.
         */
        struct aes_gcm_key base;

        /*
         * Powers of the hash key H^8 through H^1.  These are 128-bit values.
         * They all have an extra factor of x^-1 and are byte-reversed.
         * 16-byte alignment is required by the assembly code.
         */
        u64 h_powers[8][2] __aligned(16);

        /*
         * h_powers_xored[i] contains the two 64-bit halves of h_powers[i]
         * XOR'd together.  It's used for Karatsuba multiplication.  16-byte
         * alignment is required by the assembly code.
         */
        u64 h_powers_xored[8] __aligned(16);

        /*
         * H^1 times x^64 (and also the usual extra factor of x^-1).  16-byte
         * alignment is required by the assembly code.
         */
        u64 h_times_x64[2] __aligned(16);
};

#define AES_GCM_KEY_AESNI(key) \
        container_of((key), struct aes_gcm_key_aesni, base)
#define AES_GCM_KEY_AESNI_SIZE \
        (sizeof(struct aes_gcm_key_aesni) + (15 & ~(CRYPTO_MINALIGN - 1)))
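/*
 * Illustrative sketch, separate from the driver code: the Karatsuba
 * precomputation that h_powers_xored[] caches.  Over GF(2)[x], with
 * a = a1*x^64 + a0 and b = b1*x^64 + b0, the 128x128-bit carryless product
 * needs only three 64x64 multiplications:
 *
 *   a*b = a1*b1*x^128 ^ ((a1^a0)*(b1^b0) ^ a1*b1 ^ a0*b0)*x^64 ^ a0*b0
 *
 * so caching (hi ^ lo) for each hash-key power saves one XOR per GHASH
 * multiplication.  The lo/hi field names below are an assumption made for
 * this example only.
 */
#include <stdint.h>

struct ex_h_power {
        uint64_t lo, hi;        /* one 128-bit hash key power */
};

/* What a setkey path would cache: the XOR of the two 64-bit halves. */
static uint64_t ex_precompute_xored_half(const struct ex_h_power *h)
{
        return h->lo ^ h->hi;
}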
/* Key struct used by the VAES + AVX10 implementations of AES-GCM */
struct aes_gcm_key_avx10 {
        /*
         * Common part of the key.  The assembly code prefers 16-byte
         * alignment for the round keys; we get this by them being located at
         * the start of the struct and the whole struct being 64-byte aligned.
         */
        struct aes_gcm_key base;

        /*
         * Powers of the hash key H^16 through H^1.  These are 128-bit values.
         * They all have an extra factor of x^-1 and are byte-reversed.  This
         * array is aligned to a 64-byte boundary to make it naturally aligned
         * for 512-bit loads, which can improve performance.  (The assembly
         * code doesn't *need* the alignment; this is just an optimization.)
         */
        u64 h_powers[16][2] __aligned(64);

        /* Three padding blocks required by the assembly code */
        u64 padding[3][2];
};

#define AES_GCM_KEY_AVX10(key) \
        container_of((key), struct aes_gcm_key_avx10, base)
#define AES_GCM_KEY_AVX10_SIZE \
        (sizeof(struct aes_gcm_key_avx10) + (63 & ~(CRYPTO_MINALIGN - 1)))
/*
 * These flags are passed to the AES-GCM helper functions to specify the
 * specific version of AES-GCM (RFC4106 or not), whether it's encryption or
 * decryption, and which assembly functions should be called.  Assembly
 * functions are selected using flags instead of function pointers to avoid
 * indirect calls (which are very expensive on x86) regardless of inlining.
 */
#define FLAG_RFC4106    BIT(0)
#define FLAG_ENC        BIT(1)
#define FLAG_AVX        BIT(2)
#if defined(CONFIG_AS_VAES) && defined(CONFIG_AS_VPCLMULQDQ)
#  define FLAG_AVX10_256        BIT(3)
#  define FLAG_AVX10_512        BIT(4)
#else
   /*
    * This should cause all calls to the AVX10 assembly functions to be
    * optimized out, avoiding the need to ifdef each call individually.
    */
#  define FLAG_AVX10_256        0
#  define FLAG_AVX10_512        0
#endif
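/*
 * Hedged example: a hypothetical top-level wrapper showing how these flags
 * are intended to be combined.  gcm_crypt() stands for the common
 * en/decryption routine later in this file; the wrapper name here is made up.
 * Because @flags reaches the __always_inline helpers as a compile-time
 * constant, every flag test folds away and no indirect calls are emitted.
 */
static int gcm_aesni_avx_encrypt(struct aead_request *req)
{
        return gcm_crypt(req, FLAG_ENC | FLAG_AVX);
}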
static void aes_gcm_precompute(struct aes_gcm_key *key, int flags)
{
        /*
         * To make things a bit easier on the assembly side, the AVX10
         * implementations use the same key format.  Therefore, a single
         * function using 256-bit vectors would suffice here.  However, it's
         * straightforward to provide a 512-bit one because of how the
         * assembly code is structured, and it works nicely because the total
         * size of the key powers is a multiple of 512 bits.  So we take
         * advantage of that.
         *
         * A similar situation applies to the AES-NI implementations.
         */
        if (flags & FLAG_AVX10_512)
                aes_gcm_precompute_vaes_avx10_512(AES_GCM_KEY_AVX10(key));
        else if (flags & FLAG_AVX10_256)
                aes_gcm_precompute_vaes_avx10_256(AES_GCM_KEY_AVX10(key));
        else if (flags & FLAG_AVX)
                aes_gcm_precompute_aesni_avx(AES_GCM_KEY_AESNI(key));
        else
                aes_gcm_precompute_aesni(AES_GCM_KEY_AESNI(key));
}
/* __always_inline to optimize out the branches based on @flags */
static __always_inline bool __must_check
aes_gcm_dec_final(const struct aes_gcm_key *key, const u32 le_ctr[4],
                  u8 ghash_acc[16], u64 total_aadlen, u64 total_datalen,
                  u8 tag[16], int taglen, int flags)
{
        if (flags & (FLAG_AVX10_256 | FLAG_AVX10_512))
                return aes_gcm_dec_final_vaes_avx10(AES_GCM_KEY_AVX10(key),
                                                    le_ctr, ghash_acc,
                                                    total_aadlen, total_datalen,
                                                    tag, taglen);
        else if (flags & FLAG_AVX)
                return aes_gcm_dec_final_aesni_avx(AES_GCM_KEY_AESNI(key),
                                                   le_ctr, ghash_acc,
                                                   total_aadlen, total_datalen,
                                                   tag, taglen);
        else
                return aes_gcm_dec_final_aesni(AES_GCM_KEY_AESNI(key),
                                               le_ctr, ghash_acc,
                                               total_aadlen, total_datalen,
                                               tag, taglen);
}
/*
 * This is the Integrity Check Value (aka the authentication tag) length and
 * can be 8, 12 or 16 bytes long.
 */
static int common_rfc4106_set_authsize(struct crypto_aead *aead,
                                       unsigned int authsize)
{
        switch (authsize) {
        case 8:
        case 12:
        case 16:
                break;
        default:
                return -EINVAL;
        }

        return 0;
}
static int generic_gcmaes_set_authsize(struct crypto_aead *tfm,
                                       unsigned int authsize)
{
        switch (authsize) {
        case 4:
        case 8:
        case 12:
        case 13:
        case 14:
        case 15:
        case 16:
                break;
        default:
                return -EINVAL;
        }

        return 0;
}
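/*
 * Hedged example (kernel crypto API usage, separate from this file): how a
 * caller would select one of the tag lengths accepted above.  The function
 * name is made up and error handling is abbreviated.
 */
#include <crypto/aead.h>

static int example_alloc_rfc4106(struct crypto_aead **out)
{
        struct crypto_aead *tfm;
        int err;

        tfm = crypto_alloc_aead("rfc4106(gcm(aes))", 0, 0);
        if (IS_ERR(tfm))
                return PTR_ERR(tfm);

        /* 16-byte tags: accepted by common_rfc4106_set_authsize() above. */
        err = crypto_aead_setauthsize(tfm, 16);
        if (err) {
                crypto_free_aead(tfm);
                return err;
        }
        *out = tfm;
        return 0;
}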
/*
 * This is the setkey function for the x86_64 implementations of AES-GCM.  It
 * saves the RFC4106 nonce if applicable, expands the AES key, and precomputes
 * powers of the hash key.
 *
 * To comply with the crypto_aead API, this has to be usable in no-SIMD
 * context.  For that reason, this function includes a portable C
 * implementation of the needed logic.  However, the portable C implementation
 * is very slow, taking about the same time as encrypting 37 KB of data.  To
 * be ready for users that may set a key even somewhat frequently, we
 * therefore also include a SIMD assembly implementation, expanding the AES
 * key using AES-NI and precomputing the hash key powers using PCLMULQDQ or
 * VPCLMULQDQ.
 */
static int gcm_setkey(struct crypto_aead *tfm, const u8 *raw_key,
                      unsigned int keylen, int flags)
{
        struct aes_gcm_key *key = aes_gcm_key_get(tfm, flags);
        int err;

        if (flags & FLAG_RFC4106) {
                if (keylen < 4)
                        return -EINVAL;
                keylen -= 4;
                key->rfc4106_nonce = get_unaligned_be32(raw_key + keylen);
        }
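/*
 * Hedged example: the RFC4106 key blob layout that the code above parses.
 * The last 4 bytes of the blob are the nonce (the "salt" from RFC 4106), not
 * part of the AES key itself.  Shown for a 128-bit AES key; the bytes are
 * made up for illustration.
 */
static const u8 example_rfc4106_key[16 + 4] = {
        /* 16-byte AES-128 key */
        0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
        0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
        /* 4-byte nonce, consumed by get_unaligned_be32() above */
        0xde, 0xad, 0xbe, 0xef,
};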
/*
 * Initialize @ghash_acc, then pass all @assoclen bytes of associated data
 * (a.k.a. additional authenticated data) from @sg_src through the GHASH
 * update assembly function.  kernel_fpu_begin() must have already been
 * called.
 */
static void gcm_process_assoc(const struct aes_gcm_key *key, u8 ghash_acc[16],
                              struct scatterlist *sg_src,
                              unsigned int assoclen, int flags)
{
        struct scatter_walk walk;
        /*
         * The assembly function requires that the length of any non-last
         * segment of associated data be a multiple of 16 bytes, so this
         * function does the buffering needed to achieve that.
         */
        unsigned int pos = 0;
        u8 buf[16];
        /* Begin walking through the plaintext or ciphertext. */
        if (flags & FLAG_ENC)
                err = skcipher_walk_aead_encrypt(&walk, req, false);
        else
                err = skcipher_walk_aead_decrypt(&walk, req, false);
        if (err)
                return err;

        /*
         * Since the AES-GCM assembly code requires that at least three
         * assembly functions be called to process any message (this is needed
         * to support incremental updates cleanly), to reduce overhead we try
         * to do all three calls in the same kernel FPU section if possible.
         * We close the section and start a new one if there are multiple data
         * segments or if rescheduling is needed while processing the
         * associated data.
         */
        kernel_fpu_begin();

        /* Pass the associated data through GHASH. */
        gcm_process_assoc(key, ghash_acc, req->src, assoclen, flags);

        /* En/decrypt the data and pass the ciphertext through GHASH. */
        while (unlikely((nbytes = walk.nbytes) < walk.total)) {
                /*
                 * Non-last segment.  In this case, the assembly function
                 * requires that the length be a multiple of 16
                 * (AES_BLOCK_SIZE) bytes.  The needed buffering of up to 16
                 * bytes is handled by the skcipher_walk.  Here we just need
                 * to round down to a multiple of 16.
                 */
                nbytes = round_down(nbytes, AES_BLOCK_SIZE);
                aes_gcm_update(key, le_ctr, ghash_acc, walk.src.virt.addr,
                               walk.dst.virt.addr, nbytes, flags);
                le_ctr[0] += nbytes / AES_BLOCK_SIZE;
                kernel_fpu_end();
                err = skcipher_walk_done(&walk, walk.nbytes - nbytes);
                if (err)
                        return err;
                kernel_fpu_begin();
        }
        /* Last segment: process all remaining data. */
        aes_gcm_update(key, le_ctr, ghash_acc, walk.src.virt.addr,
                       walk.dst.virt.addr, nbytes, flags);
        /*
         * The low word of the counter isn't used by the finalize, so there's
         * no need to increment it here.
         */

        /* Finalize. */
        taglen = crypto_aead_authsize(tfm);
        if (flags & FLAG_ENC) {
                /* Finish computing the auth tag. */
                aes_gcm_enc_final(key, le_ctr, ghash_acc, assoclen,
                                  req->cryptlen, flags);

                /* Store the computed auth tag in the dst scatterlist. */
                scatterwalk_map_and_copy(ghash_acc, req->dst, req->assoclen +
                                         req->cryptlen, taglen, 1);
        } else {
                unsigned int datalen = req->cryptlen - taglen;
                u8 tag[16];

                /* Get the transmitted auth tag from the src scatterlist. */
                scatterwalk_map_and_copy(tag, req->src, req->assoclen + datalen,
                                         taglen, 0);
                /*
                 * Finish computing the auth tag and compare it to the
                 * transmitted one.  The assembly function does the actual tag
                 * comparison.  Here, just check the boolean result.
                 */
                if (!aes_gcm_dec_final(key, le_ctr, ghash_acc, assoclen,
                                       datalen, tag, taglen, flags))
                        err = -EBADMSG;
        }
        kernel_fpu_end();
        if (nbytes)
                skcipher_walk_done(&walk, 0);
        return err;
}
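/*
 * Hedged example (crypto API usage, separate from this file): on the decrypt
 * side, req->cryptlen covers ciphertext plus tag, and a tag mismatch surfaces
 * to the caller as -EBADMSG.  The function name is made up; this assumes a
 * synchronous tfm, so callback-based completion is omitted.
 */
#include <crypto/aead.h>
#include <linux/scatterlist.h>

static int example_aead_decrypt(struct crypto_aead *tfm, u8 *buf,
                                unsigned int assoclen, unsigned int cryptlen,
                                u8 *iv)
{
        struct aead_request *req;
        struct scatterlist sg;
        int err;

        req = aead_request_alloc(tfm, GFP_KERNEL);
        if (!req)
                return -ENOMEM;

        sg_init_one(&sg, buf, assoclen + cryptlen);
        aead_request_set_callback(req, 0, NULL, NULL);
        aead_request_set_ad(req, assoclen);
        /* On decryption, cryptlen includes the trailing auth tag. */
        aead_request_set_crypt(req, &sg, &sg, cryptlen, iv);

        err = crypto_aead_decrypt(req); /* -EBADMSG on tag mismatch */
        aead_request_free(req);
        return err;
}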
static int __init register_avx_algs(void)
{
        int err;

        if (!boot_cpu_has(X86_FEATURE_AVX))
                return 0;
        err = crypto_register_skciphers(skcipher_algs_aesni_avx,
                                        ARRAY_SIZE(skcipher_algs_aesni_avx));
        if (err)
                return err;
        err = crypto_register_aeads(aes_gcm_algs_aesni_avx,
                                    ARRAY_SIZE(aes_gcm_algs_aesni_avx));
        if (err)
                return err;
        /*
         * Note: not all the algorithms registered below actually require
         * VPCLMULQDQ.  But in practice every CPU with VAES also has
         * VPCLMULQDQ.  Similarly, the assembler support was added at about
         * the same time.  For simplicity, just always check for VAES and
         * VPCLMULQDQ together.
         */
#if defined(CONFIG_AS_VAES) && defined(CONFIG_AS_VPCLMULQDQ)
        if (!boot_cpu_has(X86_FEATURE_AVX2) ||
            !boot_cpu_has(X86_FEATURE_VAES) ||
            !boot_cpu_has(X86_FEATURE_VPCLMULQDQ) ||
            !boot_cpu_has(X86_FEATURE_PCLMULQDQ) ||
            !cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL))
                return 0;
        err = crypto_register_skciphers(skcipher_algs_vaes_avx2,
                                        ARRAY_SIZE(skcipher_algs_vaes_avx2));
        if (err)
                return err;
        err = crypto_register_aeads(aes_gcm_algs_vaes_avx10_256,
                                    ARRAY_SIZE(aes_gcm_algs_vaes_avx10_256));
        if (err)
                return err;

        if (boot_cpu_has(X86_FEATURE_PREFER_YMM)) {
                int i;

                for (i = 0; i < ARRAY_SIZE(skcipher_algs_vaes_avx512); i++)
                        skcipher_algs_vaes_avx512[i].base.cra_priority = 1;
                for (i = 0; i < ARRAY_SIZE(aes_gcm_algs_vaes_avx10_512); i++)
                        aes_gcm_algs_vaes_avx10_512[i].base.cra_priority = 1;
        }

        err = crypto_register_skciphers(skcipher_algs_vaes_avx512,
                                        ARRAY_SIZE(skcipher_algs_vaes_avx512));
        if (err)
                return err;
        err = crypto_register_aeads(aes_gcm_algs_vaes_avx10_512,
                                    ARRAY_SIZE(aes_gcm_algs_vaes_avx10_512));
        if (err)
                return err;
#endif /* CONFIG_AS_VAES && CONFIG_AS_VPCLMULQDQ */
        return 0;
}