/*
 * Copyright (c) 2022, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
*/
/**
 * This AVX/AVX2 add mask generation can be used for multiple duties:
 *      1.) Provide +0/+1 counter increments by loading 256 bits
 *          at offset 0
 *      2.) Provide +2/+2 counter increments for the second set
 *          of 4 AVX2 registers at offset 32 (256-bit load)
 *      3.) Provide a +1 increment for the second set of 4 AVX
 *          registers at offset 16 (128-bit load)
*/
ATTRIBUTE_ALIGNED(64) uint64_t CC20_COUNTER_ADD_AVX[] = {
0x0000000000000000UL, 0x0000000000000000UL,
0x0000000000000001UL, 0x0000000000000000UL,
0x0000000000000002UL, 0x0000000000000000UL,
0x0000000000000002UL, 0x0000000000000000UL,
};

static address chacha20_ctradd_avx() {
  return (address)CC20_COUNTER_ADD_AVX;
}
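
// Worked example (illustrative): ChaCha20 keeps its 32-bit block counter in
// state word 12, the low dword of the "d" row.  With that row broadcast to
// both 128-bit halves of a ymm register, the AVX2 path computes
//
//   dState  += mask at offset 0           -> halves hold counters n+0 / n+1
//   d2State  = dState + mask at offset 32 -> halves hold counters n+2 / n+3
//
// while the 128-bit load at offset 16 supplies the lone +1 for the second
// of the two blocks in the plain AVX path.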
/**
 * Add masks for 4-block ChaCha20 Block calculations
 * The first 512 bits create a +0/+1/+2/+3 add overlay.
 * The second 512 bits are a +4/+4/+4/+4 add overlay.  This
 * can be used to increment the counter fields for the next 4 blocks.
*/
ATTRIBUTE_ALIGNED(64) uint64_t CC20_COUNTER_ADD_AVX512[] = {
0x0000000000000000UL, 0x0000000000000000UL,
0x0000000000000001UL, 0x0000000000000000UL,
0x0000000000000002UL, 0x0000000000000000UL,
  0x0000000000000003UL, 0x0000000000000000UL,

  0x0000000000000004UL, 0x0000000000000000UL,
  0x0000000000000004UL, 0x0000000000000000UL,
  0x0000000000000004UL, 0x0000000000000000UL,
  0x0000000000000004UL, 0x0000000000000000UL,
};

static address chacha20_ctradd_avx512() {
  return (address)CC20_COUNTER_ADD_AVX512;
}
/**
 * The first 256 bits represent a byte-wise permutation
 * for an 8-bit left-rotation on 32-bit lanes.
 * The second 256 bits encode a 16-bit rotation on 32-bit lanes.
*/
ATTRIBUTE_ALIGNED(64) uint64_t CC20_LROT_CONSTS[] = {
0x0605040702010003UL, 0x0E0D0C0F0A09080BUL,
  0x0605040702010003UL, 0x0E0D0C0F0A09080BUL,

  0x0504070601000302UL, 0x0D0C0F0E09080B0AUL,
  0x0504070601000302UL, 0x0D0C0F0E09080B0AUL,
};

static address chacha20_lrot_consts() {
  return (address)CC20_LROT_CONSTS;
}
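
// A minimal scalar sketch of why those byte patterns work (illustrative
// only; the guard macro and helpers are hypothetical, nothing in the stub
// uses them).  vpshufb picks source bytes by index, and rotating a
// little-endian 32-bit lane left by 8 is the byte reorder b3,b0,b1,b2,
// while left by 16 is b2,b3,b0,b1: exactly the per-lane index patterns
// 3,0,1,2 and 2,3,0,1 encoded in CC20_LROT_CONSTS above.
#ifdef CC20_LROT_ILLUSTRATION
#include <cassert>
#include <cstdint>
static uint32_t cc20_rotl32(uint32_t w, int n) {
  return (w << n) | (w >> (32 - n));
}
static void cc20_lrot_illustration() {
  const uint32_t w = 0x04030201;             // bytes b0..b3 = 01 02 03 04
  assert(cc20_rotl32(w, 8)  == 0x03020104);  // byte order b3 b0 b1 b2
  assert(cc20_rotl32(w, 16) == 0x02010403);  // byte order b2 b3 b0 b1
}
#endif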
  // This function will only be called if AVX2 or AVX are supported
  // AVX512 uses a different function.
  if (VM_Version::supports_avx2()) {
vector_len = Assembler::AVX_256bit;
outlen = 256;
  } else if (VM_Version::supports_avx()) {
vector_len = Assembler::AVX_128bit;
outlen = 128;
}
__ enter();
  // Load the initial state in columnar orientation and then copy
  // that starting state to the working register set.
  // Also load the address of the add mask for later use in handling
  // multi-block counter increments.
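  // For reference, the RFC 8439 state is a 4x4 matrix of 32-bit words and
  // each SIMD "row" register covers four of them:
  //
  //    aState: words  0- 3  (the "expand 32-byte k" constants)
  //    bState: words  4- 7  (key words 0-3)
  //    cState: words  8-11  (key words 4-7)
  //    dState: words 12-15  (block counter, then the nonce)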
__ lea(rotAddr, ExternalAddress(chacha20_lrot_consts()));
  __ lea(rax, ExternalAddress(chacha20_ctradd_avx()));

  if (vector_len == Assembler::AVX_128bit) {
__ movdqu(aState, Address(state, 0)); // Bytes 0 - 15 -> a1Vec
__ movdqu(bState, Address(state, 16)); // Bytes 16 - 31 -> b1Vec
__ movdqu(cState, Address(state, 32)); // Bytes 32 - 47 -> c1Vec
    __ movdqu(dState, Address(state, 48));    // Bytes 48 - 63 -> d1Vec

    __ movdqu(a1Vec, aState);
    __ movdqu(b1Vec, bState);
    __ movdqu(c1Vec, cState);
    __ movdqu(d1Vec, dState);
__ movdqu(a2Vec, aState);
__ movdqu(b2Vec, bState);
__ movdqu(c2Vec, cState);
__ vpaddd(d2State, dState, Address(rax, 16), vector_len);
__ movdqu(d2Vec, d2State);
__ movdqu(lrot8, Address(rotAddr, 0)); // Load 8-bit lrot const
__ movdqu(lrot16, Address(rotAddr, 32)); // Load 16-bit lrot const
  } else {
    // We will broadcast each 128-bit segment of the state array into
    // the high and low halves of ymm state registers.  Then apply the add
    // mask to the dState register.  These will then be copied into the
    // a/b/c/d1Vec working registers.
__ vbroadcastf128(aState, Address(state, 0), vector_len);
__ vbroadcastf128(bState, Address(state, 16), vector_len);
__ vbroadcastf128(cState, Address(state, 32), vector_len);
__ vbroadcastf128(dState, Address(state, 48), vector_len);
__ vpaddd(dState, dState, Address(rax, 0), vector_len);
    __ vpaddd(d2State, dState, Address(rax, 32), vector_len);

    __ vmovdqu(a1Vec, aState);
    __ vmovdqu(b1Vec, bState);
    __ vmovdqu(c1Vec, cState);
    __ vmovdqu(d1Vec, dState);
    __ vmovdqu(a2Vec, aState);
    __ vmovdqu(b2Vec, bState);
    __ vmovdqu(c2Vec, cState);
    __ vmovdqu(d2Vec, d2State);

    __ vmovdqu(lrot8, Address(rotAddr, 0));    // Load 8-bit lrot const
    __ vmovdqu(lrot16, Address(rotAddr, 32));  // Load 16-bit lrot const
  }
__ movl(loopCounter, 10); // Set 10 2-round iterations
__ BIND(L_twoRounds);
  // The first quarter round macro call covers the first 4 QR operations:
  //  Qround(state, 0, 4, 8,12)
  //  Qround(state, 1, 5, 9,13)
  //  Qround(state, 2, 6,10,14)
  //  Qround(state, 3, 7,11,15)
cc20_quarter_round_avx(a1Vec, b1Vec, c1Vec, d1Vec, scratch,
lrot8, lrot16, vector_len);
cc20_quarter_round_avx(a2Vec, b2Vec, c2Vec, d2Vec, scratch,
lrot8, lrot16, vector_len);
  // Shuffle the b1Vec/c1Vec/d1Vec to reorganize the state vectors
  // to diagonals.  The a1Vec does not need to change orientation.
cc20_shift_lane_org(b1Vec, c1Vec, d1Vec, vector_len, true);
cc20_shift_lane_org(b2Vec, c2Vec, d2Vec, vector_len, true);
  // The second set of operations on the vectors covers the second 4 quarter
  // round operations, now acting on the diagonals:
  //  Qround(state, 0, 5,10,15)
  //  Qround(state, 1, 6,11,12)
  //  Qround(state, 2, 7, 8,13)
  //  Qround(state, 3, 4, 9,14)
cc20_quarter_round_avx(a1Vec, b1Vec, c1Vec, d1Vec, scratch,
lrot8, lrot16, vector_len);
cc20_quarter_round_avx(a2Vec, b2Vec, c2Vec, d2Vec, scratch,
lrot8, lrot16, vector_len);
  // Before we start the next iteration, we need to perform shuffles
  // on the b/c/d vectors to move them back to columnar organizations
  // from their current diagonal orientation.
cc20_shift_lane_org(b1Vec, c1Vec, d1Vec, vector_len, false);
  cc20_shift_lane_org(b2Vec, c2Vec, d2Vec, vector_len, false);

  __ decrement(loopCounter);
  __ jcc(Assembler::notZero, L_twoRounds);
// Add the original start state back into the current state.
__ vpaddd(a1Vec, a1Vec, aState, vector_len);
__ vpaddd(b1Vec, b1Vec, bState, vector_len);
__ vpaddd(c1Vec, c1Vec, cState, vector_len);
__ vpaddd(d1Vec, d1Vec, dState, vector_len);
  __ vpaddd(a2Vec, a2Vec, aState, vector_len);
  __ vpaddd(b2Vec, b2Vec, bState, vector_len);
  __ vpaddd(c2Vec, c2Vec, cState, vector_len);
  __ vpaddd(d2Vec, d2Vec, d2State, vector_len);

  // This function will always write 128 or 256 bytes into the
  // key stream buffer, depending on the length of the SIMD
  // registers.  That length should be returned through %rax.
  __ mov64(rax, outlen);

  __ leave();
  __ ret(0);

  return start;
}

address StubGenerator::generate_chacha20Block_avx512() {
  // Load the initial state in columnar orientation.
  // We will broadcast each 128-bit segment of the state array into
  // all four double-quadword slots on ZMM State registers.  They will
  // be copied into the working ZMM registers and then added back in
  // at the very end of the block function.  The add mask should be
  // applied to the dState register so it does not need to be fetched
  // when adding the start state back into the final working state.
__ lea(rax, ExternalAddress(chacha20_ctradd_avx512()));
__ evbroadcasti32x4(aState, Address(state, 0), Assembler::AVX_512bit);
__ evbroadcasti32x4(bState, Address(state, 16), Assembler::AVX_512bit);
__ evbroadcasti32x4(cState, Address(state, 32), Assembler::AVX_512bit);
__ evbroadcasti32x4(dState, Address(state, 48), Assembler::AVX_512bit);
__ vpaddd(dState, dState, Address(rax, 0), Assembler::AVX_512bit);
__ evmovdqul(scratch, Address(rax, 64), Assembler::AVX_512bit);
__ vpaddd(d2State, dState, scratch, Assembler::AVX_512bit);
__ vpaddd(d3State, d2State, scratch, Assembler::AVX_512bit);
__ vpaddd(d4State, d3State, scratch, Assembler::AVX_512bit);
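
  // Net effect of the adds above: with block counter n in each 128-bit
  // segment, dState covers blocks n+0..n+3, d2State covers n+4..n+7,
  // d3State covers n+8..n+11 and d4State covers n+12..n+15, so one pass
  // through this stub produces 16 blocks (1024 bytes) of keystream.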
__ movl(loopCounter, 10); // Set 10 2-round iterations
__ BIND(L_twoRounds);
  // The first set of operations on the vectors covers the first 4 quarter
  // round operations:
  //  Qround(state, 0, 4, 8,12)
  //  Qround(state, 1, 5, 9,13)
  //  Qround(state, 2, 6,10,14)
  //  Qround(state, 3, 7,11,15)
cc20_quarter_round_avx(a1Vec, b1Vec, c1Vec, d1Vec, scratch,
xnoreg, xnoreg, Assembler::AVX_512bit);
cc20_quarter_round_avx(a2Vec, b2Vec, c2Vec, d2Vec, scratch,
xnoreg, xnoreg, Assembler::AVX_512bit);
cc20_quarter_round_avx(a3Vec, b3Vec, c3Vec, d3Vec, scratch,
xnoreg, xnoreg, Assembler::AVX_512bit);
cc20_quarter_round_avx(a4Vec, b4Vec, c4Vec, d4Vec, scratch,
xnoreg, xnoreg, Assembler::AVX_512bit);
  // Shuffle the b1Vec/c1Vec/d1Vec to reorganize the state vectors
  // to diagonals.  The a1Vec does not need to change orientation.
cc20_shift_lane_org(b1Vec, c1Vec, d1Vec, Assembler::AVX_512bit, true);
cc20_shift_lane_org(b2Vec, c2Vec, d2Vec, Assembler::AVX_512bit, true);
cc20_shift_lane_org(b3Vec, c3Vec, d3Vec, Assembler::AVX_512bit, true);
cc20_shift_lane_org(b4Vec, c4Vec, d4Vec, Assembler::AVX_512bit, true);
  // The second set of operations on the vectors covers the second 4 quarter
  // round operations, now acting on the diagonals:
  //  Qround(state, 0, 5,10,15)
  //  Qround(state, 1, 6,11,12)
  //  Qround(state, 2, 7, 8,13)
  //  Qround(state, 3, 4, 9,14)
cc20_quarter_round_avx(a1Vec, b1Vec, c1Vec, d1Vec, scratch,
xnoreg, xnoreg, Assembler::AVX_512bit);
cc20_quarter_round_avx(a2Vec, b2Vec, c2Vec, d2Vec, scratch,
xnoreg, xnoreg, Assembler::AVX_512bit);
cc20_quarter_round_avx(a3Vec, b3Vec, c3Vec, d3Vec, scratch,
xnoreg, xnoreg, Assembler::AVX_512bit);
cc20_quarter_round_avx(a4Vec, b4Vec, c4Vec, d4Vec, scratch,
xnoreg, xnoreg, Assembler::AVX_512bit);
  // Before we start the next iteration, we need to perform shuffles
  // on the b/c/d vectors to move them back to columnar organizations
  // from their current diagonal orientation.
cc20_shift_lane_org(b1Vec, c1Vec, d1Vec, Assembler::AVX_512bit, false);
cc20_shift_lane_org(b2Vec, c2Vec, d2Vec, Assembler::AVX_512bit, false);
cc20_shift_lane_org(b3Vec, c3Vec, d3Vec, Assembler::AVX_512bit, false);
  cc20_shift_lane_org(b4Vec, c4Vec, d4Vec, Assembler::AVX_512bit, false);

  __ decrement(loopCounter);
  __ jcc(Assembler::notZero, L_twoRounds);
  // Add the initial state now held on the a/b/c/dState registers to the
  // final working register values.  We will also add in the counter add
  // mask onto zmm3 after adding in the start state.
__ vpaddd(a1Vec, a1Vec, aState, Assembler::AVX_512bit);
__ vpaddd(b1Vec, b1Vec, bState, Assembler::AVX_512bit);
__ vpaddd(c1Vec, c1Vec, cState, Assembler::AVX_512bit);
__ vpaddd(d1Vec, d1Vec, dState, Assembler::AVX_512bit);
  // Write the ZMM state registers out to the key stream buffer
  // Each ZMM is divided into 4 128-bit segments.  Each segment
  // is written to memory at 64-byte displacements from one
  // another.  The result is that all 4 blocks will be in their
  // proper order when serialized.
cc20_keystream_collate_avx512(a1Vec, b1Vec, c1Vec, d1Vec, result, 0);
cc20_keystream_collate_avx512(a2Vec, b2Vec, c2Vec, d2Vec, result, 256);
cc20_keystream_collate_avx512(a3Vec, b3Vec, c3Vec, d3Vec, result, 512);
cc20_keystream_collate_avx512(a4Vec, b4Vec, c4Vec, d4Vec, result, 768);
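
  // The four collate calls interleave as follows: registers a1-d1 carry
  // blocks 0-3, whose 128-bit segments land at bytes 0-255 of the output;
  // a2-d2 carry blocks 4-7 at bytes 256-511, and so on, leaving all 16
  // blocks in counter order in the key stream buffer.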
  // This function will always write 1024 bytes into the key stream buffer
  // and that length should be returned through %rax.
__ mov64(rax, 1024);
__ leave();
  __ ret(0);

  return start;
}
/**
 * Provide a function that implements the ChaCha20 quarter round function.
 *
 * @param aVec the SIMD register containing only the "a" values
 * @param bVec the SIMD register containing only the "b" values
 * @param cVec the SIMD register containing only the "c" values
 * @param dVec the SIMD register containing only the "d" values
 * @param scratch SIMD register used for non-byte-aligned left rotations
 * @param lrot8 shuffle control mask for an 8-bit left rotation (32-bit lane)
 * @param lrot16 shuffle control mask for a 16-bit left rotation (32-bit lane)
 * @param vector_len the length of the vector
 */
void StubGenerator::cc20_quarter_round_avx(XMMRegister aVec, XMMRegister bVec,
XMMRegister cVec, XMMRegister dVec, XMMRegister scratch,
XMMRegister lrot8, XMMRegister lrot16, int vector_len) {
// a += b; d ^= a; d <<<= 16
__ vpaddd(aVec, aVec, bVec, vector_len);
  __ vpxor(dVec, dVec, aVec, vector_len);
  if (vector_len == Assembler::AVX_512bit) {
__ evprold(dVec, dVec, 16, vector_len);
} else {
__ vpshufb(dVec, dVec, lrot16, vector_len);
  }

  // The remaining three steps of the quarter round (b <<<= 12,
  // d <<<= 8 via lrot8, b <<<= 7) repeat the same add/xor/rotate
  // pattern, building the non-byte-aligned rotations from
  // vpslld/vpsrld through the scratch register on AVX/AVX2 and
  // from evprold on AVX512.
}
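
// For comparison, a scalar reference of the full quarter round per
// RFC 8439, illustrative only (the guard macro is hypothetical and the
// stub generator does not call this):
#ifdef CC20_QR_REFERENCE
#include <cstdint>
static void cc20_qr_reference(uint32_t& a, uint32_t& b,
                              uint32_t& c, uint32_t& d) {
  auto rotl = [](uint32_t w, int n) {
    return (w << n) | (w >> (32 - n));
  };
  a += b;  d ^= a;  d = rotl(d, 16);
  c += d;  b ^= c;  b = rotl(b, 12);
  a += b;  d ^= a;  d = rotl(d, 8);
  c += d;  b ^= c;  b = rotl(b, 7);
}
#endif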
/**
 * Shift the b, c, and d vectors between columnar and diagonal representations.
 * Note that the "a" vector does not shift.
 *
 * @param bVec the SIMD register containing only the "b" values
 * @param cVec the SIMD register containing only the "c" values
 * @param dVec the SIMD register containing only the "d" values
 * @param vector_len the size of the SIMD register to operate upon
 * @param colToDiag true if moving columnar to diagonal, false if
 *                  moving diagonal back to columnar.
 */
void StubGenerator::cc20_shift_lane_org(XMMRegister bVec, XMMRegister cVec,
    XMMRegister dVec, int vector_len, bool colToDiag) {
  int bShift = colToDiag ? 0x39 : 0x93;
  int cShift = 0x4E;
  int dShift = colToDiag ? 0x93 : 0x39;

  __ vpshufd(bVec, bVec, bShift, vector_len);
  __ vpshufd(cVec, cVec, cShift, vector_len);
  __ vpshufd(dVec, dVec, dShift, vector_len);
}
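
// The immediates above are ordinary vpshufd selectors, two bits per
// destination dword: 0x39 (0b00111001) picks lanes 1,2,3,0 (rotate left by
// one), 0x4E (0b01001110) picks 2,3,0,1 (swap halves) and 0x93 (0b10010011)
// picks 3,0,1,2 (rotate right by one).  Going columnar-to-diagonal, lane 0
// then reads state words 0, 5, 10, 15 across a/b/c/d; swapping the b and d
// immediates reverses the mapping.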
/**
 * Write 256 bytes of keystream output held in 4 AVX512 SIMD registers
 * in a quarter round parallel organization.
 *
 * @param aVec the SIMD register containing only the "a" values
 * @param bVec the SIMD register containing only the "b" values
 * @param cVec the SIMD register containing only the "c" values
 * @param dVec the SIMD register containing only the "d" values
 * @param baseAddr the register holding the base output address
 * @param baseOffset the offset from baseAddr for writes
 */
void StubGenerator::cc20_keystream_collate_avx512(XMMRegister aVec,
    XMMRegister bVec, XMMRegister cVec, XMMRegister dVec, Register baseAddr,
    int baseOffset) {
__ vextracti32x4(Address(baseAddr, baseOffset + 0), aVec, 0);
__ vextracti32x4(Address(baseAddr, baseOffset + 64), aVec, 1);
__ vextracti32x4(Address(baseAddr, baseOffset + 128), aVec, 2);
  __ vextracti32x4(Address(baseAddr, baseOffset + 192), aVec, 3);

  // The matching extracts for bVec, cVec and dVec use the same four
  // displacements shifted by +16, +32 and +48 respectively, completing
  // each 64-byte block with its b, c and d rows.
}
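
// Worked example (illustrative): with baseOffset == 0, block 0 is assembled
// from segment 0 of each register at bytes 0-63 (a at 0, b at 16, c at 32,
// d at 48), block 1 from segment 1 at bytes 64-127, and so on, yielding the
// 256 contiguous keystream bytes this function is documented to write.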