Source file: stubGenerator_x86_64_poly.cpp
Language: C++ (HotSpot x86-64 stub generator)
/*
 * Copyright (c) 2022, Intel Corporation. All rights reserved.
 *
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */
// References:
//  - (Normative) RFC7539 - ChaCha20 and Poly1305 for IETF Protocols
//  - M. Goll and S. Gueron, "Vectorization of Poly1305 Message Authentication Code"
//  - "The design of Poly1305" https://loup-vaillant.fr/tutorials/poly1305-design

// Explanation for the 'well known' modular arithmetic optimization, reduction by
// the pseudo-Mersenne prime 2^130-5:
//
// Reduction by 2^130-5 can be expressed as follows:
//    ( a×2^130 + b ) mod 2^130-5               // i.e. number split along the 130-bit boundary
//  = ( a×2^130 - 5×a + 5×a + b ) mod 2^130-5
//  = ( a×(2^130 - 5) + 5×a + b ) mod 2^130-5   // i.e. adding multiples of the modulus is a no-op
//  = ( 5×a + b ) mod 2^130-5
// QED: shows mathematically the well-known algorithm of 'split the number down the
// middle, multiply upper and add'. This is particularly useful to understand when
// combining with 'odd-sized' limbs that might cause misalignment.

// Pseudocode conventions for this file (in general):
//   *  used for poly1305_multiply_scalar
//   ×  used for poly1305_multiply8_avx512
//   lower-case variables are scalar numbers in 3×44-bit limbs (in gprs)
//   upper-case variables are 8-element vector numbers in 3×44-bit limbs (in zmm registers)
//   [ ] used to denote vector numbers (with their elements)
// At this point, 3 64-bit limbs are in t2:a1:a0.
// t2 can span over more than 2 bits, so a final partial reduction step is needed.
//
// Partial reduction (just to fit into 130 bits):
//   a2 = t2 & 3
//   k  = (t2 & ~3) + (t2 >> 2)     // i.e. Y*4 + Y*1 = 5*Y, Y = bits of t2 above bit 1
//   a2:a1:a0 += k
//
// Result will be in a2:a1:a0.
//
// NOTE(review): this is the tail of an enclosing function whose start is not
// visible in this chunk; register roles (t0/t2/a0..a2) come from that context.
__ movq(t0, t2);
__ movl(a2, t2); // DWORD: a2 = low 32 bits of t2; masked to t2 & 3 below
__ andq(t0, ~3); // t0 = t2 with low 2 bits cleared, i.e. 4*(t2 >> 2)
__ shrq(t2, 2);  // t2 = t2 >> 2
__ addq(t0, t2); // k = 4*(t2>>2) + (t2>>2) = 5*(t2>>2): fold-back of bits above 2^130
__ andl(a2, 3);  // DWORD: a2 = t2 & 3, the two bits that stay within 130 bits
// a2:a1:a0 += k (kept in t0)
__ addq(a0, t0);
__ adcq(a1, 0);
__ adcl(a2, 0); // DWORD add suffices: a2 holds only a couple of bits here
}
// NOTE(review): tail of an AVX-512 limb-splitting helper (presumably
// poly1305_limbs_avx512, judging by the call sites below); the function
// header is not visible in this chunk. On exit, L0/L1/L2 hold the
// low/middle/high limbs of 8 message blocks, one block per qword lane.

// Highest 42-bit limbs of new blocks
__ vpsrlq(L2, TMP, 24, Assembler::AVX_512bit);
if (padMSG) {
  __ evporq(L2, L2, ExternalAddress(poly1305_pad_msg()), Assembler::AVX_512bit, rscratch); // Add 2^128 to all 8 final qwords of the message
}
// Middle 44-bit limbs of new blocks
__ vpsrlq(L1, L0, 44, Assembler::AVX_512bit);  // bits 63..44 of each low qword
__ vpsllq(TMP, TMP, 20, Assembler::AVX_512bit); // bits of the high qword that belong to the middle limb
__ vpternlogq(L1, 0xA8, TMP, ExternalAddress(poly1305_mask44()), Assembler::AVX_512bit, rscratch); // imm 0xA8 = (A | B) & C: merge L1 with TMP, then mask to 44 bits
// Lowest 44-bit limbs of new blocks
__ evpandq(L0, L0, ExternalAddress(poly1305_mask44()), Assembler::AVX_512bit, rscratch);
}
/** * Copy 5×26-bit (unreduced) limbs stored at Register limbs into a2:a1:a0 (3×64-bit limbs) * * a2 is optional. When only128 is set, limbs are expected to fit into 128-bits (i.e. a1:a0 such as clamped R)
*/ void StubGenerator::poly1305_limbs( constRegister limbs, constRegister a0, constRegister a1, constRegister a2, constRegister t0, constRegister t1)
{
__ movq(a0, Address(limbs, 0));
__ movq(t0, Address(limbs, 8));
__ shlq(t0, 26);
__ addq(a0, t0);
__ movq(t0, Address(limbs, 16));
__ movq(t1, Address(limbs, 24));
__ movq(a1, t0);
__ shlq(t0, 52);
__ shrq(a1, 12);
__ shlq(t1, 14);
__ addq(a0, t0);
__ adcq(a1, t1);
__ movq(t0, Address(limbs, 32)); if (a2 != noreg) {
__ movq(a2, t0);
__ shrq(a2, 24);
}
__ shlq(t0, 40);
__ addq(a1, t0); if (a2 != noreg) {
__ adcq(a2, 0);
// One round of reduction // Take bits above 130 in a2, multiply by 5 and add to a2:a1:a0
__ movq(t0, a2);
__ andq(t0, ~3);
__ andq(a2, 3);
__ movq(t1, t0);
__ shrq(t1, 2);
__ addq(t0, t1);
// NOTE(review): this fragment appears to belong to a different routine than the
// load-side poly1305_limbs above — it looks like the inverse conversion,
// storing a 3×64-bit value a2:a1:a0 back as 5×26-bit limbs at 'limbs'.
// Only the middle portion (building limb 2 and fixing up a1/a2) is visible.
__ shrq(a0, 26); // 12 bits left in a0, concatenate 14 from a1
__ movl(t0, a1);
__ shll(t0, 12);        // make room for the 12 remaining bits of a0
__ addl(t0, a0);        // t0 = (a1's low 14 bits << 12) | a0's remaining 12 bits
__ andl(t0, 0x3ffffff); // keep 26 bits: this is limb 2
__ movq(Address(limbs, 16), t0); // store limb 2 (third 8-byte slot)
__ shrq(a1, 14); // already used up 14 bits
__ shlq(a2, 50); // a2 contains 2 bits when reduced, but $Element.limbs dont have to be fully reduced
__ addq(a1, a2); // put remaining bits into a1
// ---- Fragment of the AVX-512 Poly1305 bulk-processing routine ----
// NOTE(review): the enclosing function's header is not visible in this chunk,
// and several stretches of its body appear to be missing between the sections
// below (flagged inline).

// To add the accumulator, we must unroll the first loop iteration

// Load first block of data (128 bytes) and pad:
//   A0 to have bits  0-43  of all 8 blocks in 8 qwords
//   A1 to have bits  87-44 of all 8 blocks in 8 qwords
//   A2 to have bits 127-88 of all 8 blocks in 8 qwords
__ evmovdquq(D0, Address(input, 0), Assembler::AVX_512bit);
__ evmovdquq(D1, Address(input, 64), Assembler::AVX_512bit);
poly1305_limbs_avx512(D0, D1, A0, A1, A2, true, TMP, t1 /*rscratch*/);

// Load next blocks of data (128 bytes) and pad:
//   A3 to have bits  0-43  of all 8 blocks in 8 qwords
//   A4 to have bits  87-44 of all 8 blocks in 8 qwords
//   A5 to have bits 127-88 of all 8 blocks in 8 qwords
__ evmovdquq(D0, Address(input, 64*2), Assembler::AVX_512bit);
__ evmovdquq(D1, Address(input, 64*3), Assembler::AVX_512bit);
poly1305_limbs_avx512(D0, D1, A3, A4, A5, true, TMP, t1 /*rscratch*/);

// Compute the powers of R^1..R^4 and form 44-bit limbs of each:
//   T0 to have bits 0-127   in 4 quadword pairs
//   T1 to have bits 128-129 in alternating 8 qwords
__ vpxorq(T1, T1, T1, Assembler::AVX_512bit);
__ movq(T2, r0);
__ vpinsrq(T2, T2, r1, 1);       // T2[127:0] = r1:r0 (R^1)
__ vinserti32x4(T0, T0, T2, 3);  // place R^1 in the top 128-bit lane of T0

// Calculate R^2
__ movq(a0, r0);
__ movq(a1, r1);
// "Clever": a2 not set, because poly1305_multiply_scalar has a flag to
// indicate a 128-bit accumulator
poly1305_multiply_scalar(a0, a1, a2,
                         r0, r1, c1, true,
                         t0, t1, t2, mulql, mulqh);

// NOTE(review): the insertion of R^2..R^4 into T0/T1 is not visible in this
// chunk — the extraction jumps straight to the interleave step below.

// Interleave the powers of R^1..R^4 to form 44-bit limbs (half-empty):
//   B0 to have bits  0-43  of all 4 blocks in alternating 8 qwords
//   B1 to have bits  87-44 of all 4 blocks in alternating 8 qwords
//   B2 to have bits 127-88 of all 4 blocks in alternating 8 qwords
__ vpxorq(T2, T2, T2, Assembler::AVX_512bit);
poly1305_limbs_avx512(T0, T2, B0, B1, B2, false, TMP, t1 /*rscratch*/);

// T1 contains the 2 highest bits of the powers of R; merge them into the
// high limbs (bit 128 maps to bit 40 of the 42-bit high limb)
__ vpsllq(T1, T1, 40, Assembler::AVX_512bit);
__ evporq(B2, B2, T1, Assembler::AVX_512bit);

// Broadcast 44-bit limbs of R^4 into R0,R1,R2
// NOTE(review): only the R0 broadcast is visible here; the R1/R2 broadcasts
// appear to be missing from this chunk.
__ mov(t0, a0);
__ andq(t0, ExternalAddress(poly1305_mask44()), t1 /*rscratch*/); // First limb (R^4[43:0])
__ evpbroadcastq(R0, t0, Assembler::AVX_512bit);

// Store R^8-R for later use
__ evmovdquq(C0, B0, Assembler::AVX_512bit);
__ evmovdquq(C1, B1, Assembler::AVX_512bit);
__ evmovdquq(C2, B2, Assembler::AVX_512bit);

// Store R^16-R^9 for later use
// NOTE(review): B0..B2 are stored twice (into C0..C2 and C3..C5) with no
// visible recomputation in between — the code raising the powers between the
// two stores is likely missing from this extraction.
__ evmovdquq(C3, B0, Assembler::AVX_512bit);
__ evmovdquq(C4, B1, Assembler::AVX_512bit);
__ evmovdquq(C5, B2, Assembler::AVX_512bit);

// VECTOR LOOP: process 16 * 16-byte message blocks at a time
__ bind(L_process256Loop);
__ cmpl(length, 16*16);
__ jcc(Assembler::less, L_process256LoopDone); // < 256 bytes left: leave loop

// Load and interleave next block of data (128 bytes)
__ evmovdquq(D0, Address(input, 0), Assembler::AVX_512bit);
__ evmovdquq(D1, Address(input, 64), Assembler::AVX_512bit);
poly1305_limbs_avx512(D0, D1, B0, B1, B2, true, TMP, t1 /*rscratch*/);

// Load and interleave next block of data (128 bytes)
__ evmovdquq(D0, Address(input, 64*2), Assembler::AVX_512bit);
__ evmovdquq(D1, Address(input, 64*3), Assembler::AVX_512bit);
poly1305_limbs_avx512(D0, D1, B3, B4, B5, true, TMP, t1 /*rscratch*/);

// accumulator += new blocks, limb by limb (no carries needed yet:
// the limbs have headroom in their 64-bit lanes)
__ vpaddq(A0, A0, B0, Assembler::AVX_512bit); // low limbs, first 8 blocks
__ vpaddq(A1, A1, B1, Assembler::AVX_512bit); // middle limbs
__ vpaddq(A2, A2, B2, Assembler::AVX_512bit); // high limbs
__ vpaddq(A3, A3, B3, Assembler::AVX_512bit); // low limbs, second 8 blocks
__ vpaddq(A4, A4, B4, Assembler::AVX_512bit); // middle limbs
__ vpaddq(A5, A5, B5, Assembler::AVX_512bit); // high limbs

// NOTE(review): the multiply/reduce step and the loop back-edge of the vector
// loop are not visible in this chunk; the extraction jumps to the cleanup.

// Cleanup: zero out zmm0-zmm31 so no key/accumulator material is left behind.
__ vzeroall(); // clears zmm0-zmm15; zmm16-zmm31 are cleared by the loop below
for (XMMRegister rxmm = xmm16; rxmm->is_valid(); rxmm = rxmm->successor()) {
  __ vpxorq(rxmm, rxmm, rxmm, Assembler::AVX_512bit);
}
}
// This function consumes as many whole 16-byte blocks as available in input.
// After execution, input and length will point at the remaining (unprocessed)
// data and accumulator will point to the current accumulator value.
address StubGenerator::generate_poly1305_processBlocks() {
  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, "StubRoutines", "poly1305_processBlocks");
  address start = __ pc();
  __ enter(); // standard frame setup

  // SCALAR LOOP: process one 16-byte message block at a time
  // NOTE(review): the declarations of the labels/registers used below
  // (L_process16Loop, L_process16LoopDone, length, ...) and the rest of this
  // function's body are not visible in this chunk — the function is truncated
  // here by the extraction.
  __ bind(L_process16Loop);
  __ cmpl(length, 16);
  __ jcc(Assembler::less, L_process16LoopDone);
Die Informationen auf dieser Webseite wurden nach bestem Wissen sorgfältig
zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung: Die farbliche Syntaxdarstellung ist noch experimentell.