/* * Copyright (c) 2003, 2022, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License version 2 only, as * published by the Free Software Foundation. * * This code is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License * version 2 for more details (a copy is included in the LICENSE file that * accompanied this code). * * You should have received a copy of the GNU General Public License version * 2 along with this work; if not, write to the Free Software Foundation, * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. * * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA * or visit www.oracle.com if you need additional information or have any * questions. *
*/
// Return point for a Java call if there's an exception thrown in // Java code. The exception is caught and transformed into a // pending exception stored in JavaThread that can be tested from // within the VM. // // Note: Usually the parameters are removed by the callee. In case // of an exception crossing an activation frame boundary, that is // not the case if the callee is compiled code => need to setup the // rsp. // // rax: exception oop
// complete return to VM
assert(StubRoutines::_call_stub_return_address != NULL, "_call_stub_return_address must have been generated before");
__ jump(RuntimeAddress(StubRoutines::_call_stub_return_address));
return start;
}
// Continuation point for runtime calls returning with a pending // exception. The pending exception check happened in the runtime // or native call stub. The pending exception in Thread is // converted into a Java-level exception. // // Contract with Java-level exception handlers: // rax: exception // rdx: throwing pc // // NOTE: At entry of this stub, exception-pc must be on stack !!
// Upon entry, the sp points to the return address returning into // Java (interpreted or compiled) code; i.e., the return address // becomes the throwing pc. // // Arguments pushed before the runtime call are still on the stack // but the exception handler will reset the stack pointer -> // ignore them. A potential result in registers can be ignored as // well.
#ifdef ASSERT // make sure this code is only executed if there is a pending exception
{
Label L;
__ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), NULL_WORD);
__ jcc(Assembler::notEqual, L);
__ stop("StubRoutines::forward exception: no pending exception (1)");
__ bind(L);
} #endif
// Support for intptr_t get_previous_sp()
//
// Emits a stub that hands back, in rax, the stack pointer value the
// caller had immediately before it called this stub.
address StubGenerator::generate_get_previous_sp() {
  StubCodeMark mark(this, "StubRoutines", "get_previous_sp");
  const address entry = __ pc();

  // On entry the return address sits at the top of the stack, so the
  // caller's own stack pointer is rsp plus the 8 bytes of that slot.
  __ movptr(rax, rsp);
  __ addptr(rax, 8);
  __ ret(0);

  return entry;
}
//---------------------------------------------------------------------------------------------------- // Support for void verify_mxcsr() // // This routine is used with -Xcheck:jni to verify that native // JNI code does not return to Java code without restoring the // MXCSR register to our expected state.
// Before the call to MacroAssembler::debug(), see below.
return_addr = 16 * wordSize,
error_msg = 17 * wordSize
};
// get object
__ movptr(rax, Address(rsp, oop_to_verify));
// make sure object is 'reasonable'
__ testptr(rax, rax);
__ jcc(Assembler::zero, exit); // if obj is NULL it is OK
#if INCLUDE_ZGC if (UseZGC) { // Check if metadata bits indicate a bad oop
__ testptr(rax, Address(r15_thread, ZThreadLocalData::address_bad_mask_offset()));
__ jcc(Assembler::notZero, error);
} #endif
// Check if the oop is in the right area of memory
__ movptr(c_rarg2, rax);
__ movptr(c_rarg3, (intptr_t) Universe::verify_oop_mask());
__ andptr(c_rarg2, c_rarg3);
__ movptr(c_rarg3, (intptr_t) Universe::verify_oop_bits());
__ cmpptr(c_rarg2, c_rarg3);
__ jcc(Assembler::notZero, error);
// make sure klass is 'reasonable', which is not zero.
__ load_klass(rax, rax, rscratch1); // get klass
__ testptr(rax, rax);
__ jcc(Assembler::zero, error); // if klass is NULL it is broken
// return if everything seems ok
__ bind(exit);
__ movptr(rax, Address(rsp, saved_rax)); // get saved rax back
__ movptr(rscratch1, Address(rsp, saved_r10)); // get saved r10 back
__ pop(c_rarg3); // restore c_rarg3
__ pop(c_rarg2); // restore c_rarg2
__ pop(r12); // restore r12
__ popf(); // restore flags
__ ret(4 * wordSize); // pop caller saved stuff
// handle errors
__ bind(error);
__ movptr(rax, Address(rsp, saved_rax)); // get saved rax back
__ movptr(rscratch1, Address(rsp, saved_r10)); // get saved r10 back
__ pop(c_rarg3); // get saved c_rarg3 back
__ pop(c_rarg2); // get saved c_rarg2 back
__ pop(r12); // get saved r12 back
__ popf(); // get saved flags off stack -- // will be ignored
__ pusha(); // push registers // (rip is already // already pushed) // debug(char* msg, int64_t pc, int64_t regs[]) // We've popped the registers we'd saved (c_rarg3, c_rarg2 and flags), and // pushed all the registers, so now the stack looks like: // [tos + 0] 16 saved registers // [tos + 16] return address // * [tos + 17] error message (char*) // * [tos + 18] object to verify (oop) // * [tos + 19] saved rax - saved by caller and bashed // * [tos + 20] saved r10 (rscratch1) - saved by caller // * = popped on exit
__ movptr(c_rarg0, Address(rsp, error_msg)); // pass address of error message
__ movptr(c_rarg1, Address(rsp, return_addr)); // pass return address
__ movq(c_rarg2, rsp); // pass address of regs on stack
__ mov(r12, rsp); // remember rsp
__ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
__ andptr(rsp, -16); // align stack as required by ABI
BLOCK_COMMENT("call MacroAssembler::debug");
__ call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug64)));
__ hlt();
return start;
}
// Shuffle first three arg regs on Windows into Linux/Solaris locations. // // Outputs: // rdi - rcx // rsi - rdx // rdx - r8 // rcx - r9 // // Registers r9 and r10 are used to save rdi and rsi on Windows, which latter // are non-volatile. r9 and r10 should not be used by the caller. // void StubGenerator::setup_arg_regs(int nargs) { constRegister saved_rdi = r9; constRegister saved_rsi = r10;
assert(nargs == 3 || nargs == 4, "else fix"); #ifdef _WIN64
assert(c_rarg0 == rcx && c_rarg1 == rdx && c_rarg2 == r8 && c_rarg3 == r9, "unexpected argument registers"); if (nargs >= 4)
__ mov(rax, r9); // r9 is also saved_rdi
__ movptr(saved_rdi, rdi);
__ movptr(saved_rsi, rsi);
__ mov(rdi, rcx); // c_rarg0
__ mov(rsi, rdx); // c_rarg1
__ mov(rdx, r8); // c_rarg2 if (nargs >= 4)
__ mov(rcx, rax); // c_rarg3 (via rax) #else
assert(c_rarg0 == rdi && c_rarg1 == rsi && c_rarg2 == rdx && c_rarg3 == rcx, "unexpected argument registers"); #endif
DEBUG_ONLY(_regs_in_thread = false;)
}
// This is used in places where r10 is a scratch register, and can // be adapted if r9 is needed also. void StubGenerator::setup_arg_regs_using_thread() { constRegister saved_r15 = r9; #ifdef _WIN64
__ mov(saved_r15, r15); // r15 is callee saved and needs to be restored
__ get_thread(r15_thread);
assert(c_rarg0 == rcx && c_rarg1 == rdx && c_rarg2 == r8 && c_rarg3 == r9, "unexpected argument registers");
__ movptr(Address(r15_thread, in_bytes(JavaThread::windows_saved_rdi_offset())), rdi);
__ movptr(Address(r15_thread, in_bytes(JavaThread::windows_saved_rsi_offset())), rsi);
void StubGenerator::restore_arg_regs_using_thread() {
assert(_regs_in_thread, "wrong call to restore_arg_regs"); constRegister saved_r15 = r9; #ifdef _WIN64
__ get_thread(r15_thread);
__ movptr(rsi, Address(r15_thread, in_bytes(JavaThread::windows_saved_rsi_offset())));
__ movptr(rdi, Address(r15_thread, in_bytes(JavaThread::windows_saved_rdi_offset())));
__ mov(r15, saved_r15); // r15 is callee saved and needs to be restored #endif
}
// Picks the argument-register setup variant from the element type:
// T_BYTE and T_SHORT use setup_arg_regs(), every other type uses
// setup_arg_regs_using_thread().
void StubGenerator::setup_argument_regs(BasicType type) {
  const bool byte_or_short = (type == T_BYTE) || (type == T_SHORT);

  if (!byte_or_short) {
    // from => rdi, to => rsi, count => rdx
    // r9 is used to save r15_thread
    setup_arg_regs_using_thread();
    return;
  }

  // from => rdi, to => rsi, count => rdx
  // r9 and r10 may be used to save non-volatile registers
  setup_arg_regs();
}
// Undoes setup_argument_regs() for the same element type: T_BYTE and
// T_SHORT use restore_arg_regs(), every other type uses
// restore_arg_regs_using_thread().
void StubGenerator::restore_argument_regs(BasicType type) {
  const bool byte_or_short = (type == T_BYTE) || (type == T_SHORT);

  if (!byte_or_short) {
    restore_arg_regs_using_thread();
    return;
  }

  restore_arg_regs();
}
// ofs and limit are used for multi-block byte array.
// int com.sun.security.provider.DigestBase.implCompressMultiBlock(byte[] b, int ofs, int limit)
address StubGenerator::generate_sha1_implCompress(bool multi_block, constchar *name) {
__ align(CodeEntryAlignment);
StubCodeMark mark(this, "StubRoutines", name);
address start = __ pc();
//Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb.
address StubGenerator::generate_pshuffle_byte_flip_mask_sha512() {
__ align32();
StubCodeMark mark(this, "StubRoutines", "pshuffle_byte_flip_mask_sha512");
address start = __ pc();
// Code for 512-bit VBMI encoding. Encodes 48 input bytes into 64 // output bytes. We read 64 input bytes and ignore the last 16, so be // sure not to read past the end of the input buffer. if (VM_Version::supports_avx512_vbmi()) {
__ cmpl(length, 64); // Do not overrun input buffer.
__ jcc(Assembler::below, L_not512);
__ shll(isURL, 6); // index into decode table based on isURL
__ lea(encode_table, ExternalAddress(StubRoutines::x86::base64_encoding_table_addr()));
__ addptr(encode_table, isURL);
__ shrl(isURL, 6); // restore isURL
// Put the input bytes into the proper lanes for writing, then // encode them.
__ evpmultishiftqb(xmm0, xmm1, xmm0, Assembler::AVX_512bit);
__ vpermb(xmm0, xmm0, xmm2, Assembler::AVX_512bit);
// Write to destination
__ evmovdquq(Address(dest, dp), xmm0, Assembler::AVX_512bit);
__ BIND(L_not512); if (VM_Version::supports_avx2()
&& VM_Version::supports_avx512vlbw()) { /* ** This AVX2 encoder is based off the paper at: ** https://dl.acm.org/doi/10.1145/3132709 ** ** We use AVX2 SIMD instructions to encode 24 bytes into 32 ** output bytes. **
*/ // Lengths under 32 bytes are done with scalar routine
__ cmpl(length, 31);
__ jcc(Assembler::belowEqual, L_process3);
// Set up supporting constant table data
__ vmovdqu(xmm9, ExternalAddress(StubRoutines::x86::base64_avx2_shuffle_addr()), rax); // 6-bit mask for 2nd and 4th (and multiples) 6-bit values
__ movl(rax, 0x0fc0fc00);
__ vmovdqu(xmm1, ExternalAddress(StubRoutines::x86::base64_avx2_input_mask_addr()), rax);
__ evpbroadcastd(xmm8, rax, Assembler::AVX_256bit);
// Multiplication constant for "shifting" right by 6 and 10 // bits
__ movl(rax, 0x04000040);
// For the first load, we mask off reading of the first 4
// bytes into the register. This is so we can get 4 3-byte
// chunks into each lane of the register, avoiding having to
// handle end conditions.  We then shuffle these bytes into a
// specific order so that manipulation is easier.
//
// The initial read loads the XMM register like this:
//
// Lower 128-bit lane:
// +----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+
// | XX | XX | XX | XX | A0 | A1 | A2 | B0 | B1 | B2 | C0 | C1 | C2 | D0 | D1 | D2 |
// +----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+
//
// Upper 128-bit lane:
// +----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+
// | E0 | E1 | E2 | F0 | F1 | F2 | G0 | G1 | G2 | H0 | H1 | H2 | XX | XX | XX | XX |
// +----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+
//
// Where A0 is the first input byte, B0 is the fourth, etc.
// The alphabetical significance denotes the 3 bytes to be
// consumed and encoded into 4 bytes.
//
// We then shuffle the register so each 32-bit word contains
// the sequence:
//    A1 A0 A2 A1, B1, B0, B2, B1, etc.
// Each of these byte sequences are then manipulated into 4
// 6-bit values ready for encoding.
//
// If we focus on one set of 3-byte chunks, changing the
// nomenclature such that A0 => a, A1 => b, and A2 => c, we
// shuffle such that each 24-bit chunk contains:
//
// b7 b6 b5 b4 b3 b2 b1 b0 | a7 a6 a5 a4 a3 a2 a1 a0 | c7 c6 c5 c4 c3 c2 c1 c0 | b7 b6 b5 b4 b3 b2 b1 b0
// Explain this step.
// b3 b2 b1 b0 c5 c4 c3 c2 | c1 c0 d5 d4 d3 d2 d1 d0 | a5 a4 a3 a2 a1 a0 b5 b4 | b3 b2 b1 b0 c5 c4 c3 c2
//
// We first AND off all but bits 4-9 and 16-21 (c5..c0 and
// a5..a0) and shift them using a vector multiplication
// operation (vpmulhuw) which effectively shifts c right by 6
// bits and a right by 10 bits.  We similarly mask bits 10-15
// (d5..d0) and 22-27 (b5..b0) and shift them left by 8 and 4
// bits respectively.  This is done using vpmullw.  We end up
// with 4 6-bit values, thus splitting the 3 input bytes,
// ready for encoding:
//    0 0 d5..d0 0 0 c5..c0 0 0 b5..b0 0 0 a5..a0
//
// For translation, we recognize that there are 5 distinct
// ranges of legal Base64 characters as below:
//
//   +-------------+-------------+------------+
//   | 6-bit value | ASCII range |   offset   |
//   +-------------+-------------+------------+
//   |    0..25    |    A..Z     |     65     |
//   |   26..51    |    a..z     |     71     |
//   |   52..61    |    0..9     |     -4     |
//   |     62      |   + or -    | -19 or -17 |
//   |     63      |   / or _    | -16 or 32  |
//   +-------------+-------------+------------+
//
// We note that vpshufb does a parallel lookup in a
// destination register using the lower 4 bits of bytes from a
// source register.  If we use a saturated subtraction and
// subtract 51 from each 6-bit value, bytes from [0,51]
// saturate to 0, and [52,63] map to a range of [1,12].  We
// distinguish the [0,25] and [26,51] ranges by assigning a
// value of 13 for all 6-bit values less than 26.  We end up
// with:
//
//   +-------------+-------------+------------+
//   | 6-bit value |   Reduced   |   offset   |
//   +-------------+-------------+------------+
//   |    0..25    |     13      |     65     |
//   |   26..51    |      0      |     71     |
//   |   52..61    |    1..10    |     -4     |
//   |     62      |     11      | -19 or -17 |
//   |     63      |     12      | -16 or 32  |
//   +-------------+-------------+------------+
//
// We then use a final vpshufb to add the appropriate offset,
// translating the bytes.
//
// Load input bytes - only 28 bytes.  Mask the first load to
// not load into the full register.
__ vpmaskmovd(xmm1, xmm1, Address(source, start_offset, Address::times_1, -4), Assembler::AVX_256bit);
// Move 3-byte chunks of input (12 bytes) into 16 bytes, // ordering by: // 1, 0, 2, 1; 4, 3, 5, 4; etc. This groups 6-bit chunks // for easy masking
__ vpshufb(xmm1, xmm1, xmm9, Assembler::AVX_256bit);
__ addl(start_offset, 24);
// Load masking register for first and third (and multiples) // 6-bit values.
__ movl(rax, 0x003f03f0);
__ evpbroadcastd(xmm6, rax, Assembler::AVX_256bit); // Multiplication constant for "shifting" left by 4 and 8 bits
__ movl(rax, 0x01000010);
__ evpbroadcastd(xmm5, rax, Assembler::AVX_256bit);
// Shift output bytes 0 and 2 into proper lanes
__ vpmulhuw(xmm2, xmm0, xmm7, Assembler::AVX_256bit);
// Mask and shift output bytes 1 and 3 into proper lanes and // combine
__ vpand(xmm0, xmm6, xmm1, Assembler::AVX_256bit);
__ vpmullw(xmm0, xmm5, xmm0, Assembler::AVX_256bit);
__ vpor(xmm0, xmm0, xmm2, Assembler::AVX_256bit);
// Find out which are 0..25. This indicates which input // values fall in the range of 'A'-'Z', which require an // additional offset (see comments above)
__ vpcmpgtb(xmm2, xmm0, xmm3, Assembler::AVX_256bit);
__ vpsubusb(xmm1, xmm0, xmm4, Assembler::AVX_256bit);
__ vpsubb(xmm1, xmm1, xmm2, Assembler::AVX_256bit);
// Shuffle the offsets based on the range calculation done // above. This allows us to add the correct offset to the // 6-bit value corresponding to the range documented above.
__ vpshufb(xmm1, xmm2, xmm1, Assembler::AVX_256bit);
__ vpaddb(xmm0, xmm1, xmm0, Assembler::AVX_256bit);
// Store the encoded bytes
__ vmovdqu(Address(dest, dp), xmm0);
__ addl(dp, 32);
// Get next 32 bytes
__ vmovdqu(xmm1, Address(source, start_offset, Address::times_1, -4));
__ subl(length, 24);
__ addl(start_offset, 24);
// This logic is identical to the above, with only constant // register loads removed. Shuffle the input, mask off 6-bit // chunks, shift them into place, then add the offset to // encode.
__ vpshufb(xmm1, xmm1, xmm9, Assembler::AVX_256bit);
__ orl(rax, r13); // At this point, rax contains | byte1 | byte2 | byte0 | byte1 // r13 has byte2 << 16 - need low-order 6 bits to translate. // This translated byte is the fourth output byte.
__ shrl(r13, 16);
__ andl(r13, 0x3f);
// The high-order 6 bits of r15 (byte0) is translated. // The translated byte is the first output byte.
__ shrl(r15, 10);
// Extract high-order 4 bits of byte1 and low-order 2 bits of byte0. // This translated byte is the second output byte.
__ shrl(rax, 4);
__ movl(r10, rax);
__ andl(rax, 0x3f);
// Extract low-order 2 bits of byte1 and high-order 4 bits of byte2. // This translated byte is the third output byte.
__ shrl(r10, 18);
__ andl(r10, 0x3f);
// Copy the low part of the lookup table into the destination of the permutation
__ evmovdquq(translated0, lookup_lo, Assembler::AVX_512bit);
__ evmovdquq(translated1, lookup_lo, Assembler::AVX_512bit);
__ evmovdquq(translated2, lookup_lo, Assembler::AVX_512bit);
__ evmovdquq(translated3, lookup_lo, Assembler::AVX_512bit);
// OR all of the translations together to check for errors (high-order bit of byte set)
__ vpternlogd(input0, 0xfe, input1, input2, Assembler::AVX_512bit);
// Check if there was an error - if so, try 64-byte chunks
__ evpmovb2m(k3, errorvec, Assembler::AVX_512bit);
__ kortestql(k3, k3);
__ jcc(Assembler::notZero, L_process64);
// The merging and shuffling happens here // We multiply each byte pair [00dddddd | 00cccccc | 00bbbbbb | 00aaaaaa] // Multiply [00cccccc] by 2^6 added to [00dddddd] to get [0000cccc | ccdddddd] // The pack16_op is a vector of 0x01400140, so multiply D by 1 and C by 0x40
__ vpmaddubsw(merge_ab_bc0, translated0, pack16_op, Assembler::AVX_512bit);
__ vpmaddubsw(merge_ab_bc1, translated1, pack16_op, Assembler::AVX_512bit);
__ vpmaddubsw(merge_ab_bc2, translated2, pack16_op, Assembler::AVX_512bit);
__ vpmaddubsw(merge_ab_bc3, translated3, pack16_op, Assembler::AVX_512bit);
// Now do the same with packed 16-bit values. // We start with [0000cccc | ccdddddd | 0000aaaa | aabbbbbb] // pack32_op is 0x00011000 (2^12, 1), so this multiplies [0000aaaa | aabbbbbb] by 2^12 // and adds [0000cccc | ccdddddd] to yield [00000000 | aaaaaabb | bbbbcccc | ccdddddd]
__ vpmaddwd(merged0, merge_ab_bc0, pack32_op, Assembler::AVX_512bit);
__ vpmaddwd(merged1, merge_ab_bc1, pack32_op, Assembler::AVX_512bit);
__ vpmaddwd(merged2, merge_ab_bc2, pack32_op, Assembler::AVX_512bit);
__ vpmaddwd(merged3, merge_ab_bc3, pack32_op, Assembler::AVX_512bit);
// The join vectors specify which byte from which vector goes into the outputs // One of every 4 bytes in the extended vector is zero, so we pack them into their // final positions in the register for storing (256 bytes in, 192 bytes out)
__ evpermt2b(merged0, join01, merged1, Assembler::AVX_512bit);
__ evpermt2b(merged1, join12, merged2, Assembler::AVX_512bit);
__ evpermt2b(merged2, join23, merged3, Assembler::AVX_512bit);
// At this point, we've decoded 64 * 4 * n bytes. // The remaining length will be <= 64 * 4 - 1. // UNLESS there was an error decoding the first 256-byte chunk. In this // case, the length will be arbitrarily long. // // Note that this will be the path for MIME-encoded strings.
// Check for error and bomb out before updating dest
__ evpmovb2m(k3, errorvec, Assembler::AVX_512bit);
__ kortestql(k3, k3);
__ jcc(Assembler::notZero, L_exit);
__ BIND(L_finalBit); // Now have 1 to 63 bytes left to decode
// I was going to let Java take care of the final fragment // however it will repeatedly call this routine for every 4 bytes // of input data, so handle the rest here.
__ movq(rax, -1);
__ bzhiq(rax, rax, length); // Input mask in rax
// Strip pad characters, if any, and adjust length and mask
__ cmpb(Address(source, length, Address::times_1, -1), '=');
__ jcc(Assembler::equal, L_padding);
// Load initial input with all valid base64 characters. Will be used // in merging source bytes to avoid masking when determining if an error occurred.
__ movl(rax, 0x61616161);
__ evpbroadcastd(input_initial_valid_b64, rax, Assembler::AVX_512bit);
// A register containing all invalid base64 decoded values
__ movl(rax, 0x80808080);
__ evpbroadcastd(invalid_b64, rax, Assembler::AVX_512bit);
// input_mask is in k1 // output_size is in r13 // output_mask is in r15 // zmm0 - free // zmm1 - 0x00011000 // zmm2 - 0x01400140 // zmm3 - errorvec // zmm4 - pack vector // zmm5 - lookup_lo // zmm6 - lookup_hi // zmm7 - errorvec // zmm8 - 0x61616161 // zmm9 - 0x80808080
// Load only the bytes from source, merging into our "fully-valid" register
__ evmovdqub(input_initial_valid_b64, input_mask, Address(source, start_offset, Address::times_1, 0x0), true, Assembler::AVX_512bit);
// Check for error. Compare (decoded | initial) to all invalid. // If any bytes have their high-order bit set, then we have an error.
__ evptestmb(k2, mask, invalid_b64, Assembler::AVX_512bit);
__ kortestql(k2, k2);
// If we have an error, use the brute force loop to decode what we can (4-byte chunks).
__ jcc(Assembler::notZero, L_bruteForce);
__ shrl(length, 2); // Multiple of 4 bytes only - length is # 4-byte chunks
__ cmpl(length, 0);
__ jcc(Assembler::lessEqual, L_exit_no_vzero);
__ shll(isURL, 8); // index into decode table based on isURL
__ lea(decode_table, ExternalAddress(StubRoutines::x86::base64_decoding_table_addr()));
__ addptr(decode_table, isURL);
BLOCK_COMMENT("Entry:");
__ enter(); // required for proper stackwalking of RuntimeStub frame
if (VM_Version::supports_sse4_1() && VM_Version::supports_avx512_vpclmulqdq() &&
VM_Version::supports_avx512bw() &&
VM_Version::supports_avx512vl()) { // The constants used in the CRC32 algorithm requires the 1's compliment of the initial crc value. // However, the constant table for CRC32-C assumes the original crc value. Account for this // difference before calling and after returning.
__ lea(table, ExternalAddress(StubRoutines::x86::crc_table_avx512_addr()));
__ notl(crc);
__ kernel_crc32_avx512(crc, buf, len, table, tmp1, tmp2);
__ notl(crc);
} else {
__ kernel_crc32(crc, buf, len, table, tmp1);
}
__ movl(rax, crc);
__ vzeroupper();
__ leave(); // required for proper stackwalking of RuntimeStub frame
__ ret(0);
return start;
}
/** * Arguments: * * Inputs: * c_rarg0 - int crc * c_rarg1 - byte* buf * c_rarg2 - long length * c_rarg3 - table_start - optional (present only when doing a library_call, * not used by x86 algorithm) * * Output: * rax - int crc result
*/
address StubGenerator::generate_updateBytesCRC32C(bool is_pclmulqdq_supported) {
assert(UseCRC32CIntrinsics, "need SSE4_2");
__ align(CodeEntryAlignment);
StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
address start = __ pc();
//reg.arg int#0 int#1 int#2 int#3 int#4 int#5 float regs //Windows RCX RDX R8 R9 none none XMM0..XMM3 //Lin / Sol RDI RSI RDX RCX R8 R9 XMM0..XMM7 constRegister crc = c_rarg0; // crc constRegister buf = c_rarg1; // source java byte array address constRegister len = c_rarg2; // length constRegister a = rax; constRegister j = r9; constRegister k = r10; constRegister l = r11; #ifdef _WIN64 constRegister y = rdi; constRegister z = rsi; #else constRegister y = rcx; constRegister z = r8; #endif
assert_different_registers(crc, buf, len, a, j, k, l, y, z);
// Next registers will be saved on stack in multiply_to_len(). constRegister tmp1 = r12; constRegister tmp2 = r13; constRegister tmp3 = r14; constRegister tmp4 = r15; constRegister tmp5 = rbx;
BLOCK_COMMENT("Entry:");
__ enter(); // required for proper stackwalking of RuntimeStub frame
#ifndef _WIN64
__ movptr(zlen, r9); // Save r9 in r11 - zlen #endif
setup_arg_regs(4); // x => rdi, xlen => rsi, y => rdx // ylen => rcx, z => r8, zlen => r11 // r9 and r10 may be used to save non-volatile registers #ifdef _WIN64 // last 2 arguments (#4, #5) are on stack on Win64
__ movptr(z, Address(rsp, 6 * wordSize));
__ movptr(zlen, Address(rsp, 7 * wordSize)); #endif
BLOCK_COMMENT("Entry:");
__ enter(); // required for proper stackwalking of RuntimeStub frame
setup_arg_regs(4); // x => rdi, len => rsi, z => rdx // zlen => rcx // r9 and r10 may be used to save non-volatile registers
__ movptr(r8, rdx);
__ square_to_len(x, len, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, rdx, rax);
restore_arg_regs();
__ leave(); // required for proper stackwalking of RuntimeStub frame
__ ret(0);
__ push(-1); // cookie, this is used for writing the new rsp when deoptimizing
BLOCK_COMMENT("Entry:");
__ enter(); // save rbp
// save c_rarg0, because we want to use that value. // We could do without it but then we depend on the number of slots used by pusha
__ push(c_rarg0);
__ lea(c_rarg0, Address(rsp, wordSize * 3)); // 1 for cookie, 1 for rbp, 1 for c_rarg0 - this should be the return address
__ pusha();
// The method may have floats as arguments, and we must spill them before calling // the VM runtime.
assert(Argument::n_float_register_parameters_j == 8, "Assumption"); constint xmm_size = wordSize * 2; constint xmm_spill_size = xmm_size * Argument::n_float_register_parameters_j;
__ subptr(rsp, xmm_spill_size);
__ movdqu(Address(rsp, xmm_size * 7), xmm7);
__ movdqu(Address(rsp, xmm_size * 6), xmm6);
__ movdqu(Address(rsp, xmm_size * 5), xmm5);
__ movdqu(Address(rsp, xmm_size * 4), xmm4);
__ movdqu(Address(rsp, xmm_size * 3), xmm3);
__ movdqu(Address(rsp, xmm_size * 2), xmm2);
__ movdqu(Address(rsp, xmm_size * 1), xmm1);
__ movdqu(Address(rsp, xmm_size * 0), xmm0);
// this can be taken out, but is good for verification purposes. getting a SIGSEGV // here while still having a correct stack is valuable
__ testptr(rsp, Address(rsp, 0));
__ movptr(rsp, Address(rsp, 0)); // new rsp was written in the barrier
__ jmp(Address(rsp, -1 * wordSize)); // jmp target should be callers verified_entry_point
return start;
}
/** * Arguments: * * Input: * c_rarg0 - out address * c_rarg1 - in address * c_rarg2 - offset * c_rarg3 - len * not Win64 * c_rarg4 - k * Win64 * rsp+40 - k
*/
address StubGenerator::generate_mulAdd() {
__ align(CodeEntryAlignment);
StubCodeMark mark(this, "StubRoutines", "mulAdd");
address start = __ pc();
// Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...) // Unix: rdi, rsi, rdx, rcx, r8, r9 (c_rarg0, c_rarg1, ...) constRegister out = rdi; constRegister in = rsi; constRegister offset = r11; constRegister len = rcx; constRegister k = r8;
// Next registers will be saved on stack in mul_add(). constRegister tmp1 = r12; constRegister tmp2 = r13; constRegister tmp3 = r14; constRegister tmp4 = r15; constRegister tmp5 = rbx;
BLOCK_COMMENT("Entry:");
__ enter(); // required for proper stackwalking of RuntimeStub frame
setup_arg_regs(4); // out => rdi, in => rsi, offset => rdx // len => rcx, k => r8 // r9 and r10 may be used to save non-volatile registers #ifdef _WIN64 // last argument is on stack on Win64
__ movl(k, Address(rsp, 6 * wordSize)); #endif
__ movptr(r11, rdx); // move offset in rdx to offset(r11)
__ mul_add(out, in, offset, len, k, tmp1, tmp2, tmp3, tmp4, tmp5, rdx, rax);
restore_arg_regs();
__ leave(); // required for proper stackwalking of RuntimeStub frame
__ ret(0);
Label Shift512Loop, ShiftTwo, ShiftTwoLoop, ShiftOne, Exit; // For Unix, the arguments are as follows: rdi, rsi, rdx, rcx, r8. constRegister newArr = rdi; constRegister oldArr = rsi; constRegister newIdx = rdx; constRegister shiftCount = rcx; // It was intentional to have shiftCount in rcx since it is used implicitly for shift. constRegister totalNumIter = r8;
// For windows, we use r9 and r10 as temps to save rdi and rsi. Thus we cannot allocate them for our temps. // For everything else, we prefer using r9 and r10 since we do not have to save them before use. constRegister tmp1 = r11; // Caller save. constRegister tmp2 = rax; // Caller save. constRegister tmp3 = WIN64_ONLY(r12) NOT_WIN64(r9); // Windows: Callee save. Linux: Caller save. constRegister tmp4 = WIN64_ONLY(r13) NOT_WIN64(r10); // Windows: Callee save. Linux: Caller save. constRegister tmp5 = r14; // Callee save. constRegister tmp6 = r15;
BLOCK_COMMENT("Entry:");
__ enter(); // required for proper stackwalking of RuntimeStub frame
#ifdef _WIN64
setup_arg_regs(4); // For windows, since last argument is on stack, we need to move it to the appropriate register.
__ movl(totalNumIter, Address(rsp, 6 * wordSize)); // Save callee save registers.
__ push(tmp3);
__ push(tmp4); #endif
__ push(tmp5);
// Rename temps used throughout the code. constRegister idx = tmp1; constRegister nIdx = tmp2;
__ xorl(idx, idx);
// Start right shift from end of the array. // For example, if #iteration = 4 and newIdx = 1 // then dest[4] = src[4] >> shiftCount | src[3] <<< (shiftCount - 32) // if #iteration = 4 and newIdx = 0 // then dest[3] = src[4] >> shiftCount | src[3] <<< (shiftCount - 32)
__ movl(idx, totalNumIter);
__ movl(nIdx, idx);
__ addl(nIdx, newIdx);
// If vectorization is enabled, check if the number of iterations is at least 64 // If not, then go to ShifTwo processing 2 iterations if (VM_Version::supports_avx512_vbmi2()) {
__ cmpptr(totalNumIter, (AVX3Threshold/64));
__ jcc(Assembler::less, ShiftTwo);
Label Shift512Loop, ShiftTwo, ShiftTwoLoop, ShiftOne, Exit; // For Unix, the arguments are as follows: rdi, rsi, rdx, rcx, r8. constRegister newArr = rdi; constRegister oldArr = rsi; constRegister newIdx = rdx; constRegister shiftCount = rcx; // It was intentional to have shiftCount in rcx since it is used implicitly for shift. constRegister totalNumIter = r8; // For windows, we use r9 and r10 as temps to save rdi and rsi. Thus we cannot allocate them for our temps. // For everything else, we prefer using r9 and r10 since we do not have to save them before use. constRegister tmp1 = r11; // Caller save. constRegister tmp2 = rax; // Caller save. constRegister tmp3 = WIN64_ONLY(r12) NOT_WIN64(r9); // Windows: Callee save. Linux: Caller save. constRegister tmp4 = WIN64_ONLY(r13) NOT_WIN64(r10); // Windows: Callee save. Linux: Caller save. constRegister tmp5 = r14; // Callee save.
#ifdef _WIN64
setup_arg_regs(4); // For windows, since last argument is on stack, we need to move it to the appropriate register.
__ movl(totalNumIter, Address(rsp, 6 * wordSize)); // Save callee save registers.
__ push(tmp3);
__ push(tmp4); #endif
__ push(tmp5);
// Rename temps used throughout the code constRegister idx = tmp1; constRegister numIterTmp = tmp2;
// Start idx from zero.
__ xorl(idx, idx); // Compute interior pointer for new array. We do this so that we can use same index for both old and new arrays.
__ lea(newArr, Address(newArr, newIdx, Address::times_4));
__ movl(numIterTmp, totalNumIter);
// If vectorization is enabled, check if the number of iterations is at least 64 // If not, then go to ShiftTwo shifting two numbers at a time if (VM_Version::supports_avx512_vbmi2()) {
__ cmpl(totalNumIter, (AVX3Threshold/64));
__ jcc(Assembler::less, ShiftTwo);
// TODO: Handle Valhalla return types. May require generating different return barriers.
if (!return_barrier) { // Pop return address. If we don't do this, we get a drift, // where the bottom-most frozen frame continuously grows.
__ pop(c_rarg3);
} else {
__ movptr(rsp, Address(r15_thread, JavaThread::cont_entry_offset()));
}
if (return_barrier) { // Restore return value from a method returning to the return barrier. // No safepoint in the call to thaw, so even an oop return value should be OK.
__ pop_d(xmm0);
__ pop(rax);
}
// rbx contains the size of the frames to thaw, 0 if overflow or no more frames
Label L_thaw_success;
__ testptr(rbx, rbx);
__ jccb(Assembler::notZero, L_thaw_success);
__ jump(ExternalAddress(StubRoutines::throw_StackOverflowError_entry()));
__ bind(L_thaw_success);
// Make room for the thawed frames and align the stack.
__ subptr(rsp, rbx);
__ andptr(rsp, -StackAlignmentInBytes);
if (return_barrier) { // Preserve possible return value from a method returning to the return barrier. (Again.)
__ push(rax);
__ push_d(xmm0);
}
// If we want, we can templatize thaw by kind, and have three different entries.
__ movptr(c_rarg0, r15_thread);
__ movptr(c_rarg1, kind);
__ call_VM_leaf(Continuation::thaw_entry(), 2);
__ movptr(rbx, rax);
if (return_barrier) { // Restore return value from a method returning to the return barrier. (Again.) // No safepoint in the call to thaw, so even an oop return value should be OK.
__ pop_d(xmm0);
__ pop(rax);
} else { // Return 0 (success) from doYield.
__ xorptr(rax, rax);
}
// After thawing, rbx is the SP of the yielding frame. // Move there, and then to saved RBP slot.
__ movptr(rsp, rbx);
__ subptr(rsp, 2*wordSize);
// Continue at exception handler: // rax: exception oop // rbx: exception handler // rdx: exception pc
__ pop(rax);
__ verify_oop(rax);
__ pop(rbp); // pop out RBP here too
__ pop(rdx);
__ jmp(rbx);
} else { // We are "returning" into the topmost thawed frame; see Thaw::push_return_frame
__ pop(rbp);
__ ret(0);
}
// For c2: c_rarg0 is junk, call to runtime to write a checkpoint.
// It returns a jobject handle to the event writer.
// The handle is dereferenced and the return value is the event writer oop.
RuntimeStub* StubGenerator::generate_jfr_write_checkpoint() { enum layout {
rbp_off,
rbpH_off,
return_off,
return_off2,
framesize // inclusive of return address
};
// rax is jobject handle result, unpack and process it through a barrier.
Label L_null_jobject;
__ testptr(rax, rax);
__ jcc(Assembler::zero, L_null_jobject);
// Continuation point for throwing of implicit exceptions that are
// not handled in the current activation. Fabricates an exception
// oop and initiates normal exception dispatching in this
// frame. Since we need to preserve callee-saved values (currently
// only for C2, but done for C1 as well) we need a callee-saved oop
// map and therefore have to make these stubs into RuntimeStubs
// rather than BufferBlobs. If the compiler needs all registers to
// be preserved between the fault point and the exception handler
// then it must assume responsibility for that in
// AbstractCompiler::continuation_for_implicit_null_exception or
// continuation_for_implicit_division_by_zero_exception. All other
// implicit exceptions (e.g., NullPointerException or
// AbstractMethodError on entry) are either at call sites or
// otherwise assume that stack unwinding will be initiated, so
// caller-saved registers were assumed volatile in the compiler.
address StubGenerator::generate_throw_exception(constchar* name,
address runtime_entry, Register arg1, Register arg2) { // Information about frame layout at time of blocking runtime call. // Note that we only have to preserve callee-saved registers since // the compilers are responsible for supplying a continuation point // if they expect all registers to be preserved. enum layout {
rbp_off = frame::arg_reg_save_area_bytes/BytesPerInt,
rbp_off2,
return_off,
return_off2,
framesize // inclusive of return address
};
int insts_size = 512; int locs_size = 64;
CodeBuffer code(name, insts_size, locs_size);
OopMapSet* oop_maps = new OopMapSet();
MacroAssembler* _masm = new MacroAssembler(&code);
address start = __ pc();
// This is an inlined and slightly modified version of call_VM // which has the ability to fetch the return PC out of // thread-local storage and also sets up last_Java_sp slightly // differently than the real call_VM
__ enter(); // required for proper stackwalking of RuntimeStub frame
assert(is_even(framesize/2), "sp not 16-byte aligned");
// return address and rbp are already in place
__ subptr(rsp, (framesize-4) << LogBytesPerInt); // prolog
int frame_complete = __ pc() - start;
// Set up last_Java_sp and last_Java_fp
address the_pc = __ pc();
__ set_last_Java_frame(rsp, rbp, the_pc, rscratch1);
__ andptr(rsp, -(StackAlignmentInBytes)); // Align stack
// Initialization void StubGenerator::generate_initial() { // Generates all stubs and initializes the entry points
// These platform-specific settings are needed by generate_call_stub()
create_control_words();
// Entry points that exist in all platforms. Note: This is code
// that could be shared among different platforms - however the
// benefit seems to be smaller than the disadvantage of having a
// much more complicated generator structure. See also comment in
// stubRoutines.hpp.
// Build this early so it's available for the interpreter.
StubRoutines::_throw_StackOverflowError_entry =
generate_throw_exception("StackOverflowError throw_exception",
CAST_FROM_FN_PTR(address,
SharedRuntime::
throw_StackOverflowError));
StubRoutines::_throw_delayed_StackOverflowError_entry =
generate_throw_exception("delayed StackOverflowError throw_exception",
CAST_FROM_FN_PTR(address,
SharedRuntime::
throw_delayed_StackOverflowError)); if (UseCRC32Intrinsics) { // set table address before stub generation which use it
StubRoutines::_crc_table_adr = (address)StubRoutines::x86::_crc_table;
StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
}
if (UsePoly1305Intrinsics) {
StubRoutines::_poly1305_processBlocks = generate_poly1305_processBlocks();
}
void StubGenerator::generate_all() { // Generates all stubs and initializes the entry points
// These entry points require SharedInfo::stack0 to be set up in
// non-core builds and need to be relocatable, so they each
// fabricate a RuntimeStub internally.
StubRoutines::_throw_AbstractMethodError_entry =
generate_throw_exception("AbstractMethodError throw_exception",
CAST_FROM_FN_PTR(address,
SharedRuntime::
throw_AbstractMethodError));
BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod(); if (bs_nm != NULL) {
StubRoutines::x86::_method_entry_barrier = generate_method_entry_barrier();
} #ifdef COMPILER2 if (UseMultiplyToLenIntrinsic) {
StubRoutines::_multiplyToLen = generate_multiplyToLen();
} if (UseSquareToLenIntrinsic) {
StubRoutines::_squareToLen = generate_squareToLen();
} if (UseMulAddIntrinsic) {
StubRoutines::_mulAdd = generate_mulAdd();
} if (VM_Version::supports_avx512_vbmi2()) {
StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift();
StubRoutines::_bigIntegerLeftShiftWorker = generate_bigIntegerLeftShift();
} if (UseMontgomeryMultiplyIntrinsic) {
StubRoutines::_montgomeryMultiply
= CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_multiply);
} if (UseMontgomerySquareIntrinsic) {
StubRoutines::_montgomerySquare
= CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_square);
}
// Get svml stub routine addresses void *libjsvml = NULL; char ebuf[1024]; char dll_name[JVM_MAXPATHLEN]; if (os::dll_locate_lib(dll_name, sizeof(dll_name), Arguments::get_dll_dir(), "jsvml")) {
libjsvml = os::dll_load(dll_name, ebuf, sizeof ebuf);
} if (libjsvml != NULL) { // SVML method naming convention // All the methods are named as __jsvml_op<T><N>_ha_<VV> // Where: // ha stands for high accuracy // <T> is optional to indicate float/double // Set to f for vector float operation // Omitted for vector double operation // <N> is the number of elements in the vector // 1, 2, 4, 8, 16 // e.g. 128 bit float vector has 4 float elements // <VV> indicates the avx/sse level: // z0 is AVX512, l9 is AVX2, e9 is AVX1 and ex is for SSE2 // e.g. __jsvml_expf16_ha_z0 is the method for computing 16 element vector float exp using AVX 512 insns // __jsvml_exp8_ha_z0 is the method for computing 8 element vector double exp using AVX 512 insns
log_info(library)("Loaded library %s, handle " INTPTR_FORMAT, JNI_LIB_PREFIX "jsvml" JNI_LIB_SUFFIX, p2i(libjsvml)); if (UseAVX > 2) { for (int op = 0; op < VectorSupport::NUM_SVML_OP; op++) { int vop = VectorSupport::VECTOR_OP_SVML_START + op; if ((!VM_Version::supports_avx512dq()) &&
(vop == VectorSupport::VECTOR_OP_LOG || vop == VectorSupport::VECTOR_OP_LOG10 || vop == VectorSupport::VECTOR_OP_POW)) { continue;
}
snprintf(ebuf, sizeof(ebuf), "__jsvml_%sf16_ha_z0", VectorSupport::svmlname[op]);
StubRoutines::_vector_f_math[VectorSupport::VEC_SIZE_512][op] = (address)os::dll_lookup(libjsvml, ebuf);
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.