/* * Copyright (c) 1997, 2022, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License version 2 only, as * published by the Free Software Foundation. * * This code is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License * version 2 for more details (a copy is included in the LICENSE file that * accompanied this code). * * You should have received a copy of the GNU General Public License version * 2 along with this work; if not, write to the Free Software Foundation, * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. * * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA * or visit www.oracle.com if you need additional information or have any * questions. *
*/
// Sign-extends the 32-bit value in lo into hi.
// When running on a P6-class CPU with hi == rdx and lo == rax the single
// cdql instruction is emitted (Intel Doc. AP-526, "Integer Divide", p.18);
// otherwise lo is copied into hi and shifted right arithmetically by 31.
void MacroAssembler::extend_sign(Register hi, Register lo) {
  const bool can_use_cdq = VM_Version::is_P6() && hi == rdx && lo == rax;
  if (can_use_cdq) {
    cdql();
    return;
  }
  movl(hi, lo);
  sarl(hi, 31);
}
// Emits a branch to L that is taken when the FPU flag C2 is set.
// The FPU status word is routed through rax into the CPU flags, where C2
// shows up as the parity bit; rax is saved to and restored from tmp around
// that sequence.
void MacroAssembler::jC2(Register tmp, Label& L) {
  // set parity bit if FPU flag C2 is set (via rax)
  save_rax(tmp);
  fwait();
  fnstsw_ax();
  sahf();
  restore_rax(tmp);

  // branch
  jcc(Assembler::parity, L);
}
// Emits a branch to L that is taken when the FPU flag C2 is NOT set.
// Same flag-transfer sequence as jC2 (status word -> rax -> CPU flags, with
// rax saved/restored through tmp), but the branch uses the inverted
// parity condition.
void MacroAssembler::jnC2(Register tmp, Label& L) {
  // set parity bit if FPU flag C2 is set (via rax)
  save_rax(tmp);
  fwait();
  fnstsw_ax();
  sahf();
  restore_rax(tmp);

  // branch
  jcc(Assembler::noParity, L);
}
// 32bit can do a case table jump in one instruction but we no longer allow the base // to be installed in the Address class void MacroAssembler::jump(ArrayAddress entry, Register rscratch) {
assert(rscratch == noreg, "not needed");
jmp(as_Address(entry, noreg));
}
// Note: y_lo will be destroyed void MacroAssembler::lcmp2int(Register x_hi, Register x_lo, Register y_hi, Register y_lo) { // Long compare for Java (semantics as described in JVM spec.)
Label high, low, done;
cmpl(x_hi, y_hi);
jcc(Assembler::less, low);
jcc(Assembler::greater, high); // x_hi is the return register
xorl(x_hi, x_hi);
cmpl(x_lo, y_lo);
jcc(Assembler::below, low);
jcc(Assembler::equal, done);
// Java shift left long support (semantics as described in JVM spec., p.305)
// (basic idea for shift counts s >= n: x << s == (x << n) << (s - n))
//
// hi:lo hold the 64-bit value; the shift count is in rcx, which is masked
// to 6 bits in place. Neither hi nor lo may be rcx.
void MacroAssembler::lshl(Register hi, Register lo) {
  // shift value is in rcx !
  assert(hi != rcx, "must not use rcx");
  assert(lo != rcx, "must not use rcx");
  const Register s = rcx;        // shift count
  const int      n = BitsPerWord;
  Label L;
  andl(s, 0x3f);                 // s := s & 0x3f (s < 0x40)
  cmpl(s, n);                    // if (s < n)
  jcc(Assembler::less, L);       // else (s >= n)
  movl(hi, lo);                  // x := x << n
  xorl(lo, lo);
  // Note: subl(s, n) is not needed since the Intel shift instructions work rcx mod n!
  bind(L);                       // s (mod n) < n
  shldl(hi, lo);                 // x := x << s
  shll(lo);
}
// Java shift right long support (semantics as described in JVM spec., p.306 & p.310)
// (basic idea for shift counts s >= n: x >> s == (x >> n) >> (s - n))
//
// hi:lo hold the 64-bit value; the shift count is in rcx, which is masked
// to 6 bits in place. Neither hi nor lo may be rcx. sign_extension selects
// an arithmetic shift (sarl) over a logical one (shrl / zeroed hi).
void MacroAssembler::lshr(Register hi, Register lo, bool sign_extension) {
  assert(hi != rcx, "must not use rcx");
  assert(lo != rcx, "must not use rcx");
  const Register s = rcx;        // shift count
  const int      n = BitsPerWord;
  Label L;
  andl(s, 0x3f);                 // s := s & 0x3f (s < 0x40)
  cmpl(s, n);                    // if (s < n)
  jcc(Assembler::less, L);       // else (s >= n)
  movl(lo, hi);                  // x := x >> n
  if (sign_extension) sarl(hi, 31);
  else                xorl(hi, hi);
  // Note: subl(s, n) is not needed since the Intel shift instructions work rcx mod n!
  bind(L);                       // s (mod n) < n
  shrdl(lo, hi);                 // x := x >> s
  if (sign_extension) sarl(hi);
  else                shrl(hi);
}
void MacroAssembler::debug32(int rdi, int rsi, int rbp, int rsp, int rbx, int rdx, int rcx, intrax, int eip, char* msg) { // In order to get locks to work, we need to fake a in_VM state
JavaThread* thread = JavaThread::current();
JavaThreadState saved_state = thread->thread_state();
thread->set_thread_state(_thread_in_vm); if (ShowMessageBoxOnError) {
JavaThread* thread = JavaThread::current();
JavaThreadState saved_state = thread->thread_state();
thread->set_thread_state(_thread_in_vm); if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
ttyLocker ttyl;
BytecodeCounter::print();
} // To see where a verify_oop failed, get $ebx+40/X for this frame. // This is the value of eip which points to where verify_oop will return. if (os::message_box(msg, "Execution stopped, print registers?")) {
print_state32(rdi, rsi, rbp, rsp, rbx, rdx, rcx, rax, eip);
BREAKPOINT;
}
}
fatal("DEBUG MESSAGE: %s", msg);
}
void MacroAssembler::print_state32(int rdi, int rsi, int rbp, int rsp, int rbx, int rdx, int rcx, int rax, int eip) {
ttyLocker ttyl;
FlagSetting fs(Debugging, true);
tty->print_cr("eip = 0x%08x", eip); #ifndef PRODUCT if ((WizardMode || Verbose) && PrintMiscellaneous) {
tty->cr();
findpc(eip);
tty->cr();
} #endif #define PRINT_REG(rax) \
{ tty->print("%s = ", #rax); os::print_location(tty, rax); }
PRINT_REG(rax);
PRINT_REG(rbx);
PRINT_REG(rcx);
PRINT_REG(rdx);
PRINT_REG(rdi);
PRINT_REG(rsi);
PRINT_REG(rbp);
PRINT_REG(rsp); #undef PRINT_REG // Print some words near top of staack. int* dump_sp = (int*) rsp; for (int col1 = 0; col1 < 8; col1++) {
tty->print("(rsp+0x%03x) 0x%08x: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
os::print_location(tty, *dump_sp++);
} for (int row = 0; row < 16; row++) {
tty->print("(rsp+0x%03x) 0x%08x: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp); for (int col = 0; col < 8; col++) {
tty->print(" 0x%08x", *dump_sp++);
}
tty->cr();
} // Print some instructions around pc:
Disassembler::decode((address)eip-64, (address)eip);
tty->print_cr("--------");
Disassembler::decode((address)eip, (address)eip+32);
}
// Converts a reachable rvalue AddressLiteral into an Address whose
// displacement is the distance from the current pc to the literal's target.
Address MacroAssembler::as_Address(AddressLiteral adr) {
  // amd64 always does this as a pc-rel
  // we can be absolute or disp based on the instruction type
  // jmp/call are displacements others are absolute
  assert(!adr.is_lval(), "must be rval");
  assert(reachable(adr), "must be");
  const int32_t pc_relative_disp = checked_cast<int32_t>(adr.target() - pc());
  return Address(pc_relative_disp, adr.target(), adr.reloc());
}
// Materializes the base of an ArrayAddress into rscratch (via lea) and
// returns an Address that combines rscratch with the array's index and scale.
Address MacroAssembler::as_Address(ArrayAddress adr, Register rscratch) {
  AddressLiteral array_base = adr.base();
  lea(rscratch, array_base);

  Address idx = adr.index();
  assert(idx._disp == 0, "must not have disp"); // maybe it can?
  return Address(rscratch, idx._index, idx._scale, idx._disp);
}
void MacroAssembler::call_VM_leaf_base(address entry_point, int num_args) {
Label L, E;
#ifdef _WIN64 // Windows always allocates space for it's register args
assert(num_args <= 4, "only register arguments supported");
subq(rsp, frame::arg_reg_save_area_bytes); #endif
// Align stack if necessary
testl(rsp, 15);
jcc(Assembler::zero, L);
int MacroAssembler::corrected_idivq(Register reg) { // Full implementation of Java ldiv and lrem; checks for special // case as described in JVM spec., p.243 & p.271. The function // returns the (pc) offset of the idivl instruction - may be needed // for implicit exceptions. // // normal case special case // // input : rax: dividend min_long // reg: divisor (may not be eax/edx) -1 // // output: rax: quotient (= rax idiv reg) min_long // rdx: remainder (= rax irem reg) 0
assert(reg != rax && reg != rdx, "reg cannot be rax or rdx register"); staticconst int64_t min_long = 0x8000000000000000;
Label normal_case, special_case;
// check for special case
cmp64(rax, ExternalAddress((address) &min_long), rdx /*rscratch*/);
jcc(Assembler::notEqual, normal_case);
xorl(rdx, rdx); // prepare rdx for possible special case (where // remainder = 0)
cmpq(reg, -1);
jcc(Assembler::equal, special_case);
// handle normal case
bind(normal_case);
cdqq(); int idivq_offset = offset();
idivq(reg);
// normal and special case exit
bind(special_case);
// 32bit can do a case table jump in one instruction but we no longer allow the base // to be installed in the Address class void MacroAssembler::jump(ArrayAddress entry, Register rscratch) {
lea(rscratch, entry.base());
Address dispatch = entry.index();
assert(dispatch._base == noreg, "must be");
dispatch._base = rscratch;
jmp(dispatch);
}
// Two-register long compare; flagged as unreachable in this variant.
void MacroAssembler::lcmp2int(Register x_hi, Register x_lo, Register y_hi, Register y_lo) {
  // 64bit doesn't use two regs
  ShouldNotReachHere();
  cmpq(x_lo, y_lo);
}
// Stores the pointer-sized constant src to dst.
// src should NEVER be a real pointer. Use AddressLiteral for true pointers.
// Constants that fit in a signed 32-bit immediate are stored directly;
// anything wider is first loaded into rscratch.
void MacroAssembler::movptr(Address dst, intptr_t src, Register rscratch) {
  if (!is_simm32(src)) {
    mov64(rscratch, src);
    movq(dst, rscratch);
    return;
  }
  movptr(dst, checked_cast<int32_t>(src));
}
// Runtime target of the 64-bit debug stop: receives a message, the pc and
// the array of saved registers. Optionally shows a message box and dumps
// the register state, then always reports the message via fatal().
void MacroAssembler::debug64(char* msg, int64_t pc, int64_t regs[]) {
  if (ShowMessageBoxOnError) {
    // In order to get locks to work, we need to fake a in_VM state
    JavaThread* thread = JavaThread::current();
    JavaThreadState saved_state = thread->thread_state();
    thread->set_thread_state(_thread_in_vm);

#ifndef PRODUCT
    if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
      ttyLocker ttyl;
      BytecodeCounter::print();
    }
#endif

    // To see where a verify_oop failed, get $ebx+40/X for this frame.
    // XXX correct this offset for amd64
    // This is the value of eip which points to where verify_oop will return.
    const bool dump_requested = os::message_box(msg, "Execution stopped, print registers?");
    if (dump_requested) {
      print_state64(pc, regs);
      BREAKPOINT;
    }
  }
  fatal("DEBUG MESSAGE: %s", msg);
}
// Dumps the 64-bit register state to tty: rip, each general purpose
// register (read from the regs array) via os::print_location, a window of
// stack words starting just past the saved registers, and a disassembly of
// the code around pc.
void MacroAssembler::print_state64(int64_t pc, int64_t regs[]) {
  ttyLocker ttyl;
  FlagSetting fs(Debugging, true);
  tty->print_cr("rip = 0x%016lx", (intptr_t)pc);
#ifndef PRODUCT
  tty->cr();
  findpc(pc);
  tty->cr();
#endif

  // Prints "<name> = " followed by a description of the given value.
#define PRINT_REG(reg, value) \
  { tty->print("%s = ", #reg); os::print_location(tty, value); }
  PRINT_REG(rax, regs[15]);
  PRINT_REG(rbx, regs[12]);
  PRINT_REG(rcx, regs[14]);
  PRINT_REG(rdx, regs[13]);
  PRINT_REG(rdi, regs[8]);
  PRINT_REG(rsi, regs[9]);
  PRINT_REG(rbp, regs[10]);
  // rsp is actually not stored by pusha(), compute the old rsp from regs (rsp after pusha): regs + 16 = old rsp
  PRINT_REG(rsp, (intptr_t)(&regs[16]));
  PRINT_REG(r8 , regs[7]);
  PRINT_REG(r9 , regs[6]);
  PRINT_REG(r10, regs[5]);
  PRINT_REG(r11, regs[4]);
  PRINT_REG(r12, regs[3]);
  PRINT_REG(r13, regs[2]);
  PRINT_REG(r14, regs[1]);
  PRINT_REG(r15, regs[0]);
#undef PRINT_REG

  // Print some words near the top of the stack.
  int64_t* rsp = &regs[16];
  int64_t* dump_sp = rsp;
  // First 8 words: one per line, each decoded with print_location.
  for (int col1 = 0; col1 < 8; col1++) {
    tty->print("(rsp+0x%03x) 0x%016lx: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
    os::print_location(tty, *dump_sp++);
  }
  // Next 25 rows of 4 raw words each.
  for (int row = 0; row < 25; row++) {
    tty->print("(rsp+0x%03x) 0x%016lx: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
    for (int col = 0; col < 4; col++) {
      tty->print(" 0x%016lx", (intptr_t)*dump_sp++);
    }
    tty->cr();
  }

  // Print some instructions around pc:
  Disassembler::decode((address)pc-64, (address)pc);
  tty->print_cr("--------");
  Disassembler::decode((address)pc, (address)pc+32);
}
// The java_calling_convention describes stack locations as ideal slots on // a frame with no abi restrictions. Since we must observe abi restrictions // (like the placement of the register window) the slots must be biased by // the following value. staticint reg2offset_in(VMReg r) { // Account for saved rbp and return address // This should really be in_preserve_stack_slots return (r->reg2stack() + 4) * VMRegImpl::stack_slot_size;
}
// A double move void MacroAssembler::double_move(VMRegPair src, VMRegPair dst, Register tmp, int in_stk_bias, int out_stk_bias) {
// The calling conventions assures us that each VMregpair is either // all really one physical register or adjacent stack slots.
if (src.is_single_phys_reg() ) { if (dst.is_single_phys_reg()) { // In theory these overlap but the ordering is such that this is likely a nop if ( src.first() != dst.first()) {
movdbl(dst.first()->as_XMMRegister(), src.first()->as_XMMRegister());
}
} else {
assert(dst.is_single_reg(), "not a stack pair");
movdbl(Address(rsp, reg2offset_out(dst.first()) + out_stk_bias), src.first()->as_XMMRegister());
}
} elseif (dst.is_single_phys_reg()) {
assert(src.is_single_reg(), "not a stack pair");
movdbl(dst.first()->as_XMMRegister(), Address(rbp, reg2offset_in(src.first()) + in_stk_bias));
} else {
assert(src.is_single_reg() && dst.is_single_reg(), "not stack pairs");
movq(tmp, Address(rbp, reg2offset_in(src.first()) + in_stk_bias));
movq(Address(rsp, reg2offset_out(dst.first()) + out_stk_bias), tmp);
}
}
// A float arg may have to do float reg int reg conversion void MacroAssembler::float_move(VMRegPair src, VMRegPair dst, Register tmp, int in_stk_bias, int out_stk_bias) {
assert(!src.second()->is_valid() && !dst.second()->is_valid(), "bad float_move");
// The calling conventions assures us that each VMregpair is either // all really one physical register or adjacent stack slots.
if (src.first()->is_stack()) { if (dst.first()->is_stack()) {
movl(tmp, Address(rbp, reg2offset_in(src.first()) + in_stk_bias));
movptr(Address(rsp, reg2offset_out(dst.first()) + out_stk_bias), tmp);
} else { // stack to reg
assert(dst.first()->is_XMMRegister(), "only expect xmm registers as parameters");
movflt(dst.first()->as_XMMRegister(), Address(rbp, reg2offset_in(src.first()) + in_stk_bias));
}
} elseif (dst.first()->is_stack()) { // reg to stack
assert(src.first()->is_XMMRegister(), "only expect xmm registers as parameters");
movflt(Address(rsp, reg2offset_out(dst.first()) + out_stk_bias), src.first()->as_XMMRegister());
} else { // reg to reg // In theory these overlap but the ordering is such that this is likely a nop if ( src.first() != dst.first()) {
movdbl(dst.first()->as_XMMRegister(), src.first()->as_XMMRegister());
}
}
}
// On 64 bit we will store integer like items to the stack as // 64 bits items (x86_32/64 abi) even though java would only store // 32bits for a parameter. On 32bit it will simply be 32 bits // So this routine will do 32->32 on 32bit and 32->64 on 64bit void MacroAssembler::move32_64(VMRegPair src, VMRegPair dst, Register tmp, int in_stk_bias, int out_stk_bias) { if (src.first()->is_stack()) { if (dst.first()->is_stack()) { // stack to stack
movslq(tmp, Address(rbp, reg2offset_in(src.first()) + in_stk_bias));
movq(Address(rsp, reg2offset_out(dst.first()) + out_stk_bias), tmp);
} else { // stack to reg
movslq(dst.first()->as_Register(), Address(rbp, reg2offset_in(src.first()) + in_stk_bias));
}
} elseif (dst.first()->is_stack()) { // reg to stack // Do we really have to sign extend??? // __ movslq(src.first()->as_Register(), src.first()->as_Register());
movq(Address(rsp, reg2offset_out(dst.first()) + out_stk_bias), src.first()->as_Register());
} else { // Do we really have to sign extend??? // __ movslq(dst.first()->as_Register(), src.first()->as_Register()); if (dst.first() != src.first()) {
movq(dst.first()->as_Register(), src.first()->as_Register());
}
}
}
// Copies a pointer-sized argument from src to dst. Incoming stack slots
// are addressed off rbp, outgoing ones off rsp. The stack-to-stack case
// goes through rax, overwriting it.
void MacroAssembler::move_ptr(VMRegPair src, VMRegPair dst) {
  if (src.first()->is_stack()) {
    if (dst.first()->is_stack()) {
      // stack to stack
      movq(rax, Address(rbp, reg2offset_in(src.first())));
      movq(Address(rsp, reg2offset_out(dst.first())), rax);
    } else {
      // stack to reg
      movq(dst.first()->as_Register(), Address(rbp, reg2offset_in(src.first())));
    }
  } else if (dst.first()->is_stack()) {
    // reg to stack
    movq(Address(rsp, reg2offset_out(dst.first())), src.first()->as_Register());
  } else {
    // reg to reg; nothing to emit when source and destination coincide
    if (dst.first() != src.first()) {
      movq(dst.first()->as_Register(), src.first()->as_Register());
    }
  }
}
// An oop arg. Must pass a handle not the oop itself void MacroAssembler::object_move(OopMap* map, int oop_handle_offset, int framesize_in_slots,
VMRegPair src,
VMRegPair dst, bool is_receiver, int* receiver_offset) {
// must pass a handle. First figure out the location we use as a handle
// Oop is already on the stack as an argument int offset_in_older_frame = src.first()->reg2stack() + SharedRuntime::out_preserve_stack_slots();
map->set_oop(VMRegImpl::stack2reg(offset_in_older_frame + framesize_in_slots)); if (is_receiver) {
*receiver_offset = (offset_in_older_frame + framesize_in_slots) * VMRegImpl::stack_slot_size;
}
map->set_oop(VMRegImpl::stack2reg(oop_slot)); // Store oop in handle area, may be NULL
movptr(Address(rsp, offset), rOop); if (is_receiver) {
*receiver_offset = offset;
}
cmpptr(rOop, NULL_WORD);
lea(rHandle, Address(rsp, offset)); // conditionally move a NULL from the handle area where it was just stored
cmovptr(Assembler::equal, rHandle, Address(rsp, offset));
}
// If arg is on the stack then place it otherwise it is already in correct reg. if (dst.first()->is_stack()) {
movptr(Address(rsp, reg2offset_out(dst.first())), rHandle);
}
}
// See 8273459. Function for ensuring 64-byte alignment, intended for stubs only.
// Stub code is generated once and never copied.
// NMethods can't use this because they get copied and we can't force alignment > 32 bytes.
void MacroAssembler::align64() {
  // Only the low 6 bits of the pc matter for 64-byte alignment. Masking
  // before narrowing keeps the value passed to the int 'target' parameter
  // non-negative; a raw pc narrowed to int can be negative, which makes
  // 'target % modulus' negative and emits 64 bytes of excess padding.
  align(64, (int)((uintptr_t)pc() & 63));
}
void MacroAssembler::align(int modulus) { // 8273459: Ensure alignment is possible with current segment alignment
assert(modulus <= CodeEntryAlignment, "Alignment must be <= CodeEntryAlignment");
align(modulus, offset());
}
// Emits the nop padding needed to advance 'target' to the next multiple of
// 'modulus'; emits nothing when target is already a multiple.
// modulus is expected to be positive.
void MacroAssembler::align(int modulus, int target) {
  int misalignment = target % modulus;
  if (misalignment < 0) {
    // In C++ the remainder takes the sign of the dividend; normalize it so a
    // negative target does not produce more than (modulus - 1) padding bytes.
    misalignment += modulus;
  }
  if (misalignment != 0) {
    nop(modulus - misalignment);
  }
}
// Writes to stack successive pages until offset reached to check for // stack overflow + shadow pages. This clobbers tmp. void MacroAssembler::bang_stack_size(Register size, Register tmp) {
movptr(tmp, rsp); // Bang stack for total size given plus shadow page size. // Bang one page at a time because large size can bang beyond yellow and // red zones.
Label loop;
bind(loop);
movl(Address(tmp, (-os::vm_page_size())), size );
subptr(tmp, os::vm_page_size());
subl(size, os::vm_page_size());
jcc(Assembler::greater, loop);
// Bang down shadow pages too. // At this point, (tmp-0) is the last address touched, so don't // touch it again. (It was touched as (tmp-pagesize) but then tmp // was post-decremented.) Skip this address by starting at i=1, and // touch a few more pages below. N.B. It is important to touch all // the way down including all pages in the shadow zone. for (int i = 1; i < ((int)StackOverflow::stack_shadow_zone_size() / os::vm_page_size()); i++) { // this could be any sized move but this is can be a debugging crumb // so the bigger the better.
movptr(Address(tmp, (-i*os::vm_page_size())), size );
}
}
void MacroAssembler::reserved_stack_check() { // testing if reserved zone needs to be enabled
Label no_reserved_zone_enabling; Register thread = NOT_LP64(rsi) LP64_ONLY(r15_thread);
NOT_LP64(get_thread(rsi);)
// Normalizes a C-style boolean in x: implements x == 0 ? 0 : 1.
void MacroAssembler::c2bool(Register x) {
  // note: must only look at least-significant byte of x
  //       since C-style booleans are stored in one byte
  //       only! (was bug)
  andl(x, 0xFF);
  setb(Assembler::notZero, x);
}
// Wouldn't need if AddressLiteral version had new name void MacroAssembler::call(Label& L, relocInfo::relocType rtype) {
Assembler::call(L, rtype);
}
void MacroAssembler::emit_static_call_stub() { // Static stub relocation also tags the Method* in the code-stream.
mov_metadata(rbx, (Metadata*) NULL); // Method is zapped till fixup time. // This is recognized as unresolved by relocs/nativeinst/ic code.
jump(RuntimeAddress(pc()));
}
void MacroAssembler::call_VM_base(Register oop_result, Register java_thread, Register last_java_sp,
address entry_point, int number_of_arguments, bool check_exceptions) { // determine java_thread register if (!java_thread->is_valid()) { #ifdef _LP64
java_thread = r15_thread; #else
java_thread = rdi;
get_thread(java_thread); #endif// LP64
} // determine last_java_sp register if (!last_java_sp->is_valid()) {
last_java_sp = rsp;
} // debugging support
assert(number_of_arguments >= 0 , "cannot have negative number of arguments");
LP64_ONLY(assert(java_thread == r15_thread, "unexpected register")); #ifdef ASSERT // TraceBytecodes does not use r12 but saves it over the call, so don't verify // r12 is the heapbase.
LP64_ONLY(if (UseCompressedOops && !TraceBytecodes) verify_heapbase("call_VM_base: heap base corrupted?");) #endif// ASSERT
assert(java_thread != oop_result , "cannot use the same register for java_thread & oop_result");
assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp");
// push java thread (becomes first argument of C function)
// set last Java frame before call
assert(last_java_sp != rbp, "can't use ebp/rbp");
// Only interpreter should have to set fp
set_last_Java_frame(java_thread, last_java_sp, rbp, NULL, rscratch1);
// do the call, remove parameters
MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments);
// restore the thread (cannot use the pushed argument since arguments // may be overwritten by C code generated by an optimizing compiler); // however can use the register value directly if it is callee saved. if (LP64_ONLY(true ||) java_thread == rdi || java_thread == rsi) { // rdi & rsi (also r15) are callee saved -> nothing to do #ifdef ASSERT
guarantee(java_thread != rax, "change this code");
push(rax);
{ Label L;
get_thread(rax);
cmpptr(java_thread, rax);
jcc(Assembler::equal, L);
STOP("MacroAssembler::call_VM_base: rdi not callee saved?");
bind(L);
}
pop(rax); #endif
} else {
get_thread(java_thread);
} // reset last Java frame // Only interpreter should have to clear fp
reset_last_Java_frame(java_thread, true);
// C++ interp handles this in the interpreter
check_and_handle_popframe(java_thread);
check_and_handle_earlyret(java_thread);
if (check_exceptions) { // check for pending exceptions (java_thread is set upon return)
cmpptr(Address(java_thread, Thread::pending_exception_offset()), NULL_WORD); #ifndef _LP64
jump_cc(Assembler::notEqual,
RuntimeAddress(StubRoutines::forward_exception_entry())); #else // This used to conditionally jump to forward_exception however it is // possible if we relocate that the branch will not reach. So we must jump // around so we can always reach
// get oop result if there is one and reset the value in the thread if (oop_result->is_valid()) {
get_vm_result(oop_result, java_thread);
}
}
void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) {
// Calculate the value for last_Java_sp // somewhat subtle. call_VM does an intermediate call // which places a return address on the stack just under the // stack pointer as the user finished with it. This allows // use to retrieve last_Java_pc from last_Java_sp[-1]. // On 32bit we then have to push additional args on the stack to accomplish // the actual requested call. On 64bit call_VM only can use register args // so the only extra space is the return address that call_VM created. // This hopefully explains the calculations here.
// Use this method when MacroAssembler version of call_VM_leaf_base() should be called from Interpreter. void MacroAssembler::call_VM_leaf0(address entry_point) {
MacroAssembler::call_VM_leaf_base(entry_point, 0);
}
// Leaf call taking only an argument count; delegates to call_VM_leaf_base.
void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) {
  this->call_VM_leaf_base(entry_point, number_of_arguments);
}
int MacroAssembler::corrected_idivl(Register reg) { // Full implementation of Java idiv and irem; checks for // special case as described in JVM spec., p.243 & p.271. // The function returns the (pc) offset of the idivl // instruction - may be needed for implicit exceptions. // // normal case special case // // input : rax,: dividend min_int // reg: divisor (may not be rax,/rdx) -1 // // output: rax,: quotient (= rax, idiv reg) min_int // rdx: remainder (= rax, irem reg) 0
assert(reg != rax && reg != rdx, "reg cannot be rax, or rdx register"); constint min_int = 0x80000000;
Label normal_case, special_case;
// check for special case
cmpl(rax, min_int);
jcc(Assembler::notEqual, normal_case);
xorl(rdx, rdx); // prepare rdx for possible special case (where remainder = 0)
cmpl(reg, -1);
jcc(Assembler::equal, special_case);
// handle normal case
bind(normal_case);
cdql(); int idivl_offset = offset();
idivl(reg);
// normal and special case exit
bind(special_case);
// dst = c = a * b + c void MacroAssembler::fmad(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c) {
Assembler::vfmadd231sd(c, a, b); if (dst != c) {
movdbl(dst, c);
}
}
// dst = c = a * b + c void MacroAssembler::fmaf(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c) {
Assembler::vfmadd231ss(c, a, b); if (dst != c) {
movflt(dst, c);
}
}
// dst = c = a * b + c void MacroAssembler::vfmad(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c, int vector_len) {
Assembler::vfmadd231pd(c, a, b, vector_len); if (dst != c) {
vmovdqu(dst, c);
}
}
// dst = c = a * b + c void MacroAssembler::vfmaf(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c, int vector_len) {
Assembler::vfmadd231ps(c, a, b, vector_len); if (dst != c) {
vmovdqu(dst, c);
}
}
// dst = c = a * b + c void MacroAssembler::vfmad(XMMRegister dst, XMMRegister a, Address b, XMMRegister c, int vector_len) {
Assembler::vfmadd231pd(c, a, b, vector_len); if (dst != c) {
vmovdqu(dst, c);
}
}
// dst = c = a * b + c void MacroAssembler::vfmaf(XMMRegister dst, XMMRegister a, Address b, XMMRegister c, int vector_len) {
Assembler::vfmadd231ps(c, a, b, vector_len); if (dst != c) {
vmovdqu(dst, c);
}
}
// Loads the byte at src into dst with sign extension.
// Returns the code offset of the instruction that performs the memory access.
int MacroAssembler::load_signed_byte(Register dst, Address src) {
  if (LP64_ONLY(true ||) VM_Version::is_P6()) {
    const int load_offset = offset();
    movsbl(dst, src); // movsxb
    return load_offset;
  }
  // Fallback: zero-extending load, then sign-extend with a shift pair.
  const int load_offset = load_unsigned_byte(dst, src);
  shll(dst, 24);
  sarl(dst, 24);
  return load_offset;
}
// Note: load_signed_short used to be called load_signed_word. // Although the 'w' in x86 opcodes refers to the term "word" in the assembler // manual, which means 16 bits, that usage is found nowhere in HotSpot code. // The term "word" in HotSpot means a 32- or 64-bit machine word. int MacroAssembler::load_signed_short(Register dst, Address src) { int off; if (LP64_ONLY(true ||) VM_Version::is_P6()) { // This is dubious to me since it seems safe to do a signed 16 => 64 bit // version but this is what 64bit has always done. This seems to imply // that users are only using 32bits worth.
off = offset();
movswl(dst, src); // movsxw
} else {
off = load_unsigned_short(dst, src);
shll(dst, 16);
sarl(dst, 16);
} return off;
}
// Loads the byte at src into dst with zero extension.
// Returns the code offset of the instruction that performs the memory access.
int MacroAssembler::load_unsigned_byte(Register dst, Address src) {
  // According to Intel Doc. AP-526, "Zero-Extension of Short", p.16,
  // and "3.9 Partial Register Penalties", p. 22).
  if (LP64_ONLY(true || ) VM_Version::is_P6() || src.uses(dst)) {
    const int load_offset = offset();
    movzbl(dst, src); // movzxb
    return load_offset;
  }
  // Clear dst first, then load only the low byte; the offset is taken
  // after the xor so it refers to the load itself.
  xorl(dst, dst);
  const int load_offset = offset();
  movb(dst, src);
  return load_offset;
}
// Note: load_unsigned_short used to be called load_unsigned_word. int MacroAssembler::load_unsigned_short(Register dst, Address src) { // According to Intel Doc. AP-526, "Zero-Extension of Short", p.16, // and "3.9 Partial Register Penalties", p. 22). int off; if (LP64_ONLY(true ||) VM_Version::is_P6() || src.uses(dst)) {
off = offset();
movzwl(dst, src); // movzxw
} else {
xorl(dst, dst);
off = offset();
movw(dst, src);
} return off;
}
// Loads the pointer-sized constant src into dst.
// src should NEVER be a real pointer. Use AddressLiteral for true pointers.
void MacroAssembler::movptr(Register dst, intptr_t src) {
#ifdef _LP64
  mov64(dst, src);
#else
  movl(dst, src);
#endif
}
// Emits an explicit null check of reg when an access at the given offset
// would not trap on its own.
void MacroAssembler::null_check(Register reg, int offset) {
  if (!needs_explicit_null_check(offset)) {
    // nothing to do, (later) access of M[reg + offset]
    // will provoke OS NULL exception if reg = NULL
    return;
  }
  // provoke OS NULL exception if reg = NULL by
  // accessing M[reg] w/o changing any (non-CC) registers
  // NOTE: cmpl is plenty here to provoke a segv
  cmpptr(rax, Address(reg, 0));
  // Note: should probably use testl(rax, Address(reg, 0));
  //       may be shorter code (however, this version of
  //       testl needs to be implemented first)
}
void MacroAssembler::os_breakpoint() { // instead of directly emitting a breakpoint, call os:breakpoint for better debugability // (e.g., MSVC can't call ps() otherwise)
call(RuntimeAddress(CAST_FROM_FN_PTR(address, os::breakpoint)));
}
// Save Integer and Float state // Warning: Stack must be 16 byte aligned (64bit) void MacroAssembler::push_CPU_state() {
push_IU_state();
push_FPU_state();
}
// The code below wants the original RSP. // Move it back after the pushes above.
movptr(rrealsp, rsp);
addptr(rrealsp, 2*wordSize); #else Register rthread = r15_thread; Register rrealsp = rsp; #endif
// The code below wants the original RSP. // Move it back after the pushes above.
movptr(rrealsp, rsp);
addptr(rrealsp, 2*wordSize); #else Register rthread = r15_thread; Register rrealsp = rsp; #endif
// Clears the last-Java-frame anchor of the thread in java_thread: sp and pc
// are always zeroed, fp only when clear_fp is set. Ends with vzeroupper().
// When java_thread is not a valid register, rdi is loaded with the current
// thread and used instead.
void MacroAssembler::reset_last_Java_frame(Register java_thread, bool clear_fp) {
  // determine java_thread register
  if (!java_thread->is_valid()) {
    java_thread = rdi;
    get_thread(java_thread);
  }

  // we must set sp to zero to clear frame
  const Address anchor_sp(java_thread, JavaThread::last_Java_sp_offset());
  movptr(anchor_sp, NULL_WORD);

  // must clear fp, so that compiled frames are not confused; it is
  // possible that we need it only for debugging
  if (clear_fp) {
    const Address anchor_fp(java_thread, JavaThread::last_Java_fp_offset());
    movptr(anchor_fp, NULL_WORD);
  }

  // Always clear the pc because it could have been set by make_walkable()
  const Address anchor_pc(java_thread, JavaThread::last_Java_pc_offset());
  movptr(anchor_pc, NULL_WORD);

  vzeroupper();
}
// Emits a safepoint poll that branches to slow_path when the thread's
// polling word requests it. At a return, the frame/stack pointer is
// compared against the polling word; otherwise the poll bit is tested.
void MacroAssembler::safepoint_poll(Label& slow_path, Register thread_reg, bool at_return, bool in_nmethod) {
  const Address polling_word(thread_reg, JavaThread::polling_word_offset());
  if (at_return) {
    // Note that when in_nmethod is set, the stack pointer is incremented before the poll. Therefore,
    // we may safely use rsp instead to perform the stack watermark check.
    cmpptr(in_nmethod ? rsp : rbp, polling_word);
    jcc(Assembler::above, slow_path);
  } else {
    testb(polling_word, SafepointMechanism::poll_bit());
    jcc(Assembler::notZero, slow_path); // handshake bit set implies poll
  }
}
// Calls to C land // // When entering C land, the rbp, & rsp of the last Java frame have to be recorded // in the (thread-local) JavaThread object. When leaving C land, the last Java fp // has to be reset to 0. This is required to allow proper stack traversal. void MacroAssembler::set_last_Java_frame(Register java_thread, Register last_java_sp, Register last_java_fp,
address last_java_pc, Register rscratch) {
vzeroupper(); // determine java_thread register if (!java_thread->is_valid()) {
java_thread = rdi;
get_thread(java_thread);
} // determine last_java_sp register if (!last_java_sp->is_valid()) {
last_java_sp = rsp;
} // last_java_fp is optional if (last_java_fp->is_valid()) {
movptr(Address(java_thread, JavaThread::last_Java_fp_offset()), last_java_fp);
} // last_java_pc is optional if (last_java_pc != NULL) {
Address java_pc(java_thread,
JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset());
lea(java_pc, InternalAddress(last_java_pc), rscratch);
}
movptr(Address(java_thread, JavaThread::last_Java_sp_offset()), last_java_sp);
}
// Pointer-sized subtract of an immediate from dst.
// Force generation of a 4 byte immediate value even if it fits into 8bit.
void MacroAssembler::subptr_imm32(Register dst, int32_t imm32) {
#ifdef _LP64
  subq_imm32(dst, imm32);
#else
  subl_imm32(dst, imm32);
#endif
}
int gp_area_size; int fp_area_size; int xmm_area_size; int total_save_size = register_section_sizes(gp_registers_to_push, call_clobbered_xmm_registers(), save_fpu,
gp_area_size, fp_area_size, xmm_area_size);
subptr(rsp, total_save_size);
int gp_area_size; int fp_area_size; int xmm_area_size; int total_save_size = register_section_sizes(gp_registers_to_pop, call_clobbered_xmm_registers(), restore_fpu,
gp_area_size, fp_area_size, xmm_area_size);
// Saves every XMM register in 'set' via save_xmm_register, at consecutive
// xmm_save_size() steps starting at 'offset'. The stack pointer is not
// adjusted here.
void MacroAssembler::push_set(XMMRegSet set, int offset) {
  assert(is_aligned(set.size() * xmm_save_size(), StackAlignmentInBytes), "must be");

  int slot_offset = offset;
  RegSetIterator<XMMRegister> cursor = set.begin();
  while (*cursor != xnoreg) {
    save_xmm_register(this, slot_offset, *cursor);
    slot_offset += xmm_save_size();
    ++cursor;
  }
}
// Reload every XMM register in 'set' from the slots that start at 'offset'.
// The set is walked in reverse, beginning with the highest slot and stepping
// down by xmm_save_size() per register.
void MacroAssembler::pop_set(XMMRegSet set, int offset) {
  const int slot_size = xmm_save_size();
  const int area_size = set.size() * slot_size;
  assert(is_aligned(area_size, StackAlignmentInBytes), "must be");

  // Start at the last slot of the area.
  int slot = offset + area_size - slot_size;
  ReverseRegSetIterator<XMMRegister> cursor = set.rbegin();
  while (*cursor != xnoreg) {
    restore_xmm_register(this, slot, *cursor);
    slot -= slot_size;
    ++cursor;
  }
}
// Store every general purpose register in 'set' to rsp-relative slots,
// walking the set in forward order.
//
//   offset == -1 : rsp is first lowered by the aligned size of the whole set
//                  and the registers are stored starting at [rsp + 0]
//   otherwise    : rsp is left alone and the registers are stored starting
//                  at [rsp + offset]
void MacroAssembler::push_set(RegSet set, int offset) {
  const int slot_size = Register::max_slots_per_register * VMRegImpl::stack_slot_size;

  int slot = offset;
  if (offset == -1) {
    const int needed  = set.size() * slot_size;
    const int reserve = align_up(needed, StackAlignmentInBytes);
    subptr(rsp, reserve);
    slot = 0;
  }

  RegSetIterator<Register> cursor = set.begin();
  while (*cursor != noreg) {
    movptr(Address(rsp, slot), *cursor);
    slot += slot_size;
    ++cursor;
  }
}
// Reload every general purpose register in 'set' from rsp-relative slots,
// walking the set in reverse (highest slot first).
//
//   offset == -1 : the slots start at [rsp + 0]; after the reloads rsp is
//                  raised by the aligned size of the whole set
//   otherwise    : the slots start at [rsp + offset] and rsp is left alone
void MacroAssembler::pop_set(RegSet set, int offset) {
  const int  slot_size    = Register::max_slots_per_register * VMRegImpl::stack_slot_size;
  const int  area_size    = set.size() * slot_size;
  const int  release_size = align_up(area_size, StackAlignmentInBytes);
  const bool owns_area    = (offset == -1);

  // Begin with the last slot of the area.
  int slot = (owns_area ? 0 : offset) + area_size - slot_size;
  ReverseRegSetIterator<Register> cursor = set.rbegin();
  while (*cursor != noreg) {
    movptr(*cursor, Address(rsp, slot));
    slot -= slot_size;
    ++cursor;
  }

  if (owns_area) {
    addptr(rsp, release_size);
  }
}
// Preserves the contents of address, destroys the contents length_in_bytes and temp. void MacroAssembler::zero_memory(Register address, Register length_in_bytes, int offset_in_bytes, Register temp) {
assert(address != length_in_bytes && address != temp && temp != length_in_bytes, "registers must be different");
assert((offset_in_bytes & (BytesPerWord - 1)) == 0, "offset must be a multiple of BytesPerWord");
Label done;
// initialize topmost word, divide index by 2, check if odd and test if zero // note: for the remaining code to work, index must be a multiple of BytesPerWord #ifdef ASSERT
{
Label L;
testptr(length_in_bytes, BytesPerWord - 1);
jcc(Assembler::zero, L);
stop("length must be a multiple of BytesPerWord");
bind(L);
} #endif Register index = length_in_bytes;
xorptr(temp, temp); // use _zero reg to clear memory (shorter code) if (UseIncDec) {
shrptr(index, 3); // divide by 8/16 and set carry flag if bit 2 was set
} else {
shrptr(index, 2); // use 2 instructions to avoid partial flag stall
shrptr(index, 1);
} #ifndef _LP64 // index could have not been a multiple of 8 (i.e., bit 2 was set)
{
Label even; // note: if index was a multiple of 8, then it cannot // be 0 now otherwise it must have been 0 before // => if it is even, we don't need to check for 0 again
jcc(Assembler::carryClear, even); // clear topmost word (no jump would be needed if conditional assignment worked here)
movptr(Address(address, index, Address::times_8, offset_in_bytes - 0*BytesPerWord), temp); // index could be 0 now, must check again
jcc(Assembler::zero, done);
bind(even);
} #endif// !_LP64 // initialize remaining object fields: index is a multiple of 2 now
{
Label loop;
bind(loop);
movptr(Address(address, index, Address::times_8, offset_in_bytes - 1*BytesPerWord), temp);
NOT_LP64(movptr(Address(address, index, Address::times_8, offset_in_bytes - 2*BytesPerWord), temp);)
decrement(index);
jcc(Assembler::notZero, loop);
}
bind(done);
}
// Register contract (enforced by the asserts at the top of the body):
//  - recv_klass, intf_klass and scan_temp are pairwise distinct, as are method_result, intf_klass and scan_temp;
//  - recv_klass may be the same register as method_result only when return_method is false;
//  - a non-constant itable_index must live in method_result.
// Look up the method for a megamorphic invokeinterface call. // The target method is determined by <intf_klass, itable_index>. // The receiver klass is in recv_klass. // On success, the result will be in method_result, and execution falls through. // On failure, execution transfers to the given label. void MacroAssembler::lookup_interface_method(Register recv_klass, Register intf_klass,
RegisterOrConstant itable_index, Register method_result, Register scan_temp,
Label& L_no_such_interface, bool return_method) {
assert_different_registers(recv_klass, intf_klass, scan_temp);
assert_different_registers(method_result, intf_klass, scan_temp);
assert(recv_klass != method_result || !return_method, "recv_klass can be destroyed when method isn't needed");
assert(itable_index.is_constant() || itable_index.as_register() == method_result, "caller must use same register for non-constant itable index as for method");
// Compute start of first itableOffsetEntry (which is at the end of the vtable) int vtable_base = in_bytes(Klass::vtable_start_offset()); int itentry_off = itableMethodEntry::method_offset_in_bytes(); int scan_step = itableOffsetEntry::size() * wordSize; int vte_size = vtableEntry::size_in_bytes();
Address::ScaleFactor times_vte_scale = Address::times_ptr;
assert(vte_size == wordSize, "else adjust times_vte_scale");
// %%% Could store the aligned, prescaled offset in the klassoop.
// scan_temp := recv_klass + scan_temp (scaled by times_ptr) + vtable_base.
lea(scan_temp, Address(recv_klass, scan_temp, times_vte_scale, vtable_base));
if (return_method) { // Adjust recv_klass by scaled itable_index, so we can free itable_index.
assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
lea(recv_klass, Address(recv_klass, itable_index, Address::times_ptr, itentry_off));
}
// NOTE(review): `peel`, `search` and `found_method` are not declared anywhere in the visible body, and the
// closing brace after addptr() below has no visible opener -- confirm the enclosing loop against the complete file.
if (peel) {
jccb(Assembler::equal, found_method);
} else {
jccb(Assembler::notEqual, search); // (invert the test to fall through to found_method...)
}
if (!peel) break;
bind(search);
// Check that the previous entry is non-null. A null entry means that // the receiver class doesn't implement the interface, and wasn't the // same as when the caller was compiled.
testptr(method_result, method_result);
jcc(Assembler::zero, L_no_such_interface);
// Advance scan_temp by one scan_step (itableOffsetEntry::size() * wordSize bytes).
addptr(scan_temp, scan_step);
}
bind(found_method);
// On a hit: load the 32-bit offset field of the entry scan_temp points at, then fetch the method from
// recv_klass + that offset (recv_klass was already advanced by the scaled itable_index above).
if (return_method) { // Got a hit.
movl(scan_temp, Address(scan_temp, itableOffsetEntry::offset_offset_in_bytes()));
movptr(method_result, Address(recv_klass, scan_temp, Address::times_1));
}
}
// virtual method calling void MacroAssembler::lookup_virtual_method(Register recv_klass,
RegisterOrConstant vtable_index, Register method_result) { constint base = in_bytes(Klass::vtable_start_offset());
assert(vtableEntry::size() * wordSize == wordSize, "else adjust the scaling in the code below");
Address vtable_entry_addr(recv_klass,
vtable_index, Address::times_ptr,
base + vtableEntry::method_offset_in_bytes());
movptr(method_result, vtable_entry_addr);
}
void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass, Register super_klass, Register temp_reg,
Label* L_success,
Label* L_failure,
Label* L_slow_path,
RegisterOrConstant super_check_offset) {
assert_different_registers(sub_klass, super_klass, temp_reg); bool must_load_sco = (super_check_offset.constant_or_zero() == -1); if (super_check_offset.is_register()) {
assert_different_registers(sub_klass, super_klass,
super_check_offset.as_register());
} elseif (must_load_sco) {
assert(temp_reg != noreg, "supply either a temp or a register offset");
}
Label L_fallthrough; int label_nulls = 0; if (L_success == NULL) { L_success = &L_fallthrough; label_nulls++; } if (L_failure == NULL) { L_failure = &L_fallthrough; label_nulls++; } if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }
assert(label_nulls <= 1, "at most one NULL in the batch");
int sc_offset = in_bytes(Klass::secondary_super_cache_offset()); int sco_offset = in_bytes(Klass::super_check_offset_offset());
Address super_check_offset_addr(super_klass, sco_offset);
// Hacked jcc, which "knows" that L_fallthrough, at least, is in // range of a jccb. If this routine grows larger, reconsider at // least some of these. #define local_jcc(assembler_cond, label) \ if (&(label) == &L_fallthrough) jccb(assembler_cond, label); \ else jcc( assembler_cond, label) /*omit semi*/
// Hacked jmp, which may only be used just before L_fallthrough. #define final_jmp(label) \ if (&(label) == &L_fallthrough) { /*do nothing*/ } \ else jmp(label) /*omit semi*/
// If the pointers are equal, we are done (e.g., String[] elements). // This self-check enables sharing of secondary supertype arrays among // non-primary types such as array-of-interface. Otherwise, each such // type would need its own customized SSA. // We move this check to the front of the fast path because many // type checks are in fact trivially successful in this manner, // so we get a nicely predicted branch right at the start of the check.
cmpptr(sub_klass, super_klass);
local_jcc(Assembler::equal, *L_success);
// Check the supertype display: if (must_load_sco) { // Positive movl does right thing on LP64.
movl(temp_reg, super_check_offset_addr);
super_check_offset = RegisterOrConstant(temp_reg);
}
Address super_check_addr(sub_klass, super_check_offset, Address::times_1, 0);
cmpptr(super_klass, super_check_addr); // load displayed supertype
// This check has worked decisively for primary supers. // Secondary supers are sought in the super_cache ('super_cache_addr'). // (Secondary supers are interfaces and very deeply nested subtypes.) // This works in the same check above because of a tricky aliasing // between the super_cache and the primary super display elements. // (The 'super_check_addr' can address either, as the case requires.) // Note that the cache is updated below if it does not help us find // what we need immediately. // So if it was a primary super, we can just fail immediately. // Otherwise, it's the slow path for us (no success at this point).
if (super_check_offset.is_register()) {
local_jcc(Assembler::equal, *L_success);
cmpl(super_check_offset.as_register(), sc_offset); if (L_failure == &L_fallthrough) {
local_jcc(Assembler::equal, *L_slow_path);
} else {
local_jcc(Assembler::notEqual, *L_failure);
final_jmp(*L_slow_path);
}
} elseif (super_check_offset.as_constant() == sc_offset) { // Need a slow path; fast failure is impossible. if (L_slow_path == &L_fallthrough) {
local_jcc(Assembler::equal, *L_success);
} else {
local_jcc(Assembler::notEqual, *L_slow_path);
final_jmp(*L_success);
}
} else { // No slow path; it's a fast decision. if (L_failure == &L_fallthrough) {
local_jcc(Assembler::equal, *L_success);
} else {
local_jcc(Assembler::notEqual, *L_failure);
final_jmp(*L_success);
}
}
Label L_fallthrough; int label_nulls = 0; if (L_success == NULL) { L_success = &L_fallthrough; label_nulls++; } if (L_failure == NULL) { L_failure = &L_fallthrough; label_nulls++; }
assert(label_nulls <= 1, "at most one NULL in the batch");
// a couple of useful fields in sub_klass: int ss_offset = in_bytes(Klass::secondary_supers_offset()); int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
Address secondary_supers_addr(sub_klass, ss_offset);
Address super_cache_addr( sub_klass, sc_offset);
// Do a linear scan of the secondary super-klass chain. // This code is rarely used, so simplicity is a virtue here. // The repne_scan instruction uses fixed registers, which we must spill. // Don't worry too much about pre-existing connections with the input regs.
assert(sub_klass != rax, "killed reg"); // killed by mov(rax, super)
assert(sub_klass != rcx, "killed reg"); // killed by lea(rcx, &pst_counter)
// Get super_klass value into rax (even if it was in rdi or rcx). bool pushed_rax = false, pushed_rcx = false, pushed_rdi = false; if (super_klass != rax) { if (!IS_A_TEMP(rax)) { push(rax); pushed_rax = true; }
mov(rax, super_klass);
} if (!IS_A_TEMP(rcx)) { push(rcx); pushed_rcx = true; } if (!IS_A_TEMP(rdi)) { push(rdi); pushed_rdi = true; }
// We will consult the secondary-super array.
movptr(rdi, secondary_supers_addr); // Load the array length. (Positive movl does right thing on LP64.)
movl(rcx, Address(rdi, Array<Klass*>::length_offset_in_bytes())); // Skip to start of data.
addptr(rdi, Array<Klass*>::base_offset_in_bytes());
// Scan RCX words at [RDI] for an occurrence of RAX. // Set NZ/Z based on last compare. // Z flag value will not be set by 'repne' if RCX == 0 since 'repne' does // not change flags (only scas instruction which is repeated sets flags). // Set Z = 0 (not equal) before 'repne' to indicate that class was not found.
testptr(rax,rax); // Set Z = 0
repne_scan();
// Unspill the temp. registers: if (pushed_rdi) pop(rdi); if (pushed_rcx) pop(rcx); if (pushed_rax) pop(rax);
if (set_cond_codes) { // Special hack for the AD files: rdi is guaranteed non-zero.
assert(!pushed_rdi, "rdi must be left non-NULL"); // Also, the condition codes are properly set Z/NZ on succeed/failure.
}
if (L_failure == &L_fallthrough)
jccb(Assembler::notEqual, *L_failure); else jcc(Assembler::notEqual, *L_failure);
// Success. Cache the super we found and proceed in triumph.
movptr(super_cache_addr, super_klass);
if (L_success != &L_fallthrough) {
jmp(*L_success);
}
#undef IS_A_TEMP
bind(L_fallthrough);
}
// Class initialization barrier.
//
// Emits two checks on 'klass': whether its init state equals
// InstanceKlass::fully_initialized, and whether 'thread' equals the klass's
// recorded init thread. Either check succeeding takes the fast path; otherwise
// the slow path is taken.
//
//   L_fast_path / L_slow_path - branch targets. Exactly one of them may be
//     NULL, meaning "fall through": the corresponding path then continues
//     right after the emitted code. Passing two non-NULL labels is not
//     implemented.
void MacroAssembler::clinit_barrier(Register klass, Register thread, Label* L_fast_path, Label* L_slow_path) {
  assert(L_fast_path != NULL || L_slow_path != NULL, "at least one is required");

  // A NULL label is replaced by a local fall-through label so that it can be
  // dereferenced and compared against below.
  Label L_fallthrough;
  if (L_fast_path == NULL) {
    L_fast_path = &L_fallthrough;
  } else if (L_slow_path == NULL) {
    L_slow_path = &L_fallthrough;
  }

  // Fast path check: class is fully initialized
  cmpb(Address(klass, InstanceKlass::init_state_offset()), InstanceKlass::fully_initialized);
  jcc(Assembler::equal, *L_fast_path);

  // Fast path check: current thread is initializer thread
  cmpptr(thread, Address(klass, InstanceKlass::init_thread_offset()));
  if (L_slow_path == &L_fallthrough) {
    // Slow path falls through: branch away only on success.
    jcc(Assembler::equal, *L_fast_path);
    bind(*L_slow_path);
  } else if (L_fast_path == &L_fallthrough) {
    // Fast path falls through: branch away only on failure.
    jcc(Assembler::notEqual, *L_slow_path);
    bind(*L_fast_path);
  } else {
    // Both targets were supplied by the caller.
    Unimplemented();
  }
}
// Pass register number to verify_oop_subroutine constchar* b = NULL;
{
ResourceMark rm;
stringStream ss;
ss.print("verify_oop: %s: %s (%s:%d)", reg->name(), s, file, line);
b = code_string(ss.as_string());
}
ExternalAddress buffer((address) b);
pushptr(buffer.addr(), rscratch1);
// call indirectly to solve generation ordering problem
movptr(rax, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
call(rax); // Caller pops the arguments (oop, message) and restores rax, r10
BLOCK_COMMENT("} verify_oop");
}
// Set every bit of dst, picking the instruction by CPU capability:
// vpternlogd (AVX-512), else vpcmpeqd (AVX), else pcmpeqd (SSE2).
void MacroAssembler::vallones(XMMRegister dst, int vector_len) {
  const bool use_ternlog =
      UseAVX > 2 &&
      (vector_len == Assembler::AVX_512bit || VM_Version::supports_avx512vl());

  if (use_ternlog) {
    // Only pcmpeq gets dependency-breaking treatment (i.e. execution can
    // start without waiting for the previous result in dst); vpcmpeqd does
    // not, so vpternlog is used here.
    vpternlogd(dst, 0xFF, dst, dst, vector_len);
    return;
  }

  if (VM_Version::supports_avx()) {
    vpcmpeqd(dst, dst, dst, vector_len);
    return;
  }

  assert(VM_Version::supports_sse2(), "");
  pcmpeqd(dst, dst);
}
// Build the rsp-relative address of an interpreter argument slot.
// arg_slot is either a constant slot number, which is folded into the
// displacement, or a register, which becomes the scaled index.
// extra_slot_offset is passed to Interpreter::expr_offset_in_bytes() to
// obtain the base displacement. One extra word is added for the return PC.
// cf. TemplateTable::prepare_invoke(), if (load_receiver).
Address MacroAssembler::argument_address(RegisterOrConstant arg_slot,
                                         int extra_slot_offset) {
  const int element_size = Interpreter::stackElementSize;
  int disp = Interpreter::expr_offset_in_bytes(extra_slot_offset+0);
#ifdef ASSERT
  const int next_disp = Interpreter::expr_offset_in_bytes(extra_slot_offset+1);
  assert(next_disp - disp == element_size, "correct arithmetic");
#endif

  if (arg_slot.is_constant()) {
    // Constant slot: no index register needed.
    disp += arg_slot.as_constant() * element_size;
    disp += wordSize;   // return PC is on stack
    return Address(rsp, noreg, Address::no_scale, disp);
  }

  // Register slot: scale the register by the element size.
  disp += wordSize;     // return PC is on stack
  return Address(rsp, arg_slot.as_register(), Address::times(element_size), disp);
}
void MacroAssembler::_verify_oop_addr(Address addr, constchar* s, constchar* file, int line) { if (!VerifyOops) return;
#ifdef _LP64
push(rscratch1); #endif
push(rax); // save rax, // addr may contain rsp so we will have to adjust it based on the push // we just did (and on 64 bit we do two pushes) // NOTE: 64bit seemed to have had a bug in that it did movq(addr, rax); which // stores rax into addr which is backwards of what was intended. if (addr.uses(rsp)) {
lea(rax, addr);
pushptr(Address(rax, LP64_ONLY(2 *) BytesPerWord));
} else {
pushptr(addr);
}
// Pass register number to verify_oop_subroutine constchar* b = NULL;
{
ResourceMark rm;
stringStream ss;
ss.print("verify_oop_addr: %s (%s:%d)", s, file, line);
b = code_string(ss.as_string());
}
ExternalAddress buffer((address) b);
pushptr(buffer.addr(), rscratch1);
// call indirectly to solve generation ordering problem
movptr(rax, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
call(rax); // Caller pops the arguments (addr, message) and restores rax, r10.
}
void MacroAssembler::print_CPU_state() {
push_CPU_state();
push(rsp); // pass CPU state
call(RuntimeAddress(CAST_FROM_FN_PTR(address, _print_CPU_state)));
addptr(rsp, wordSize); // discard argument
pop_CPU_state();
}
#ifndef _LP64
// Check a saved x87 FPU state against an expected register-stack depth.
//
//   stack_depth < 0  : only ST7 is checked; it must carry tag 3 (reported as
//                      "not empty" otherwise). Nothing else is looked at.
//   stack_depth >= 0 : the control word must equal the standard one, the
//                      registers with tag < 3 must form one contiguous run
//                      starting at ST0, and the length of that run must be
//                      exactly stack_depth.
//
//   s     - prefix used in the diagnostic messages
//   state - saved CPU state whose _fpu_state is inspected
//
// Returns true when the state passes. On a violation a diagnostic and the
// whole state are printed, assert(false) fires, and false is returned.
static bool _verify_FPU(int stack_depth, char* s, CPU_State* state) {
  // Incremented on every call and never read in this function
  // (presumably a debugging aid -- TODO confirm).
  static int counter = 0;
  FPU_State* fs = &state->_fpu_state;
  counter++;

  // For leaf calls, only verify that the top few elements remain empty.
  // We only need 1 empty at the top for C2 code.
  if (stack_depth < 0) {
    if (fs->tag_for_st(7) != 3) {
      printf("FPR7 not empty\n");
      state->print();
      assert(false, "error");
      return false;
    }
    return true;                // All other stack states do not matter
  }

  assert((fs->_control_word._value & 0xffff) == StubRoutines::x86::fpu_cntrl_wrd_std(), "bad FPU control word");

  // compute stack depth: d is the number of leading registers with tag < 3
  int i = 0;
  while (i < FPU_State::number_of_registers && fs->tag_for_st(i) < 3) i++;
  int d = i;
  // every register after that run must carry tag 3
  while (i < FPU_State::number_of_registers && fs->tag_for_st(i) == 3) i++;

  // verify findings
  if (i != FPU_State::number_of_registers) {
    // stack not contiguous
    printf("%s: stack not contiguous at ST%d\n", s, i);
    state->print();
    assert(false, "error");
    return false;
  }

  // stack_depth is known to be >= 0 here (negative values returned above),
  // so the computed depth has to match it exactly.
  if (d != stack_depth) {
    // wrong stack depth
    printf("%s: %d stack elements expected but found %d\n", s, stack_depth, d);
    state->print();
    assert(false, "error");
    return false;
  }

  // everything is cool
  return true;
}
// Emitted after returning from a JNI call to bring the CPU control state
// back in line (or to verify that it was not changed).
void MacroAssembler::restore_cpu_control_state_after_jni(Register rscratch) {
  // MXCSR: either reload the standard value, or (with -Xcheck:jni) verify
  // that the native code left it untouched.
  const bool has_sse = VM_Version::supports_sse();
  if (has_sse && RestoreMXCSROnJNICalls) {
    ldmxcsr(ExternalAddress(StubRoutines::x86::addr_mxcsr_std()), rscratch);
  } else if (has_sse && CheckJNICalls) {
    call(RuntimeAddress(StubRoutines::x86::verify_mxcsr_entry()));
  }

  // Clear upper bits of YMM registers to avoid SSE <-> AVX transition penalty.
  vzeroupper();

#ifndef _LP64
  // x87 control word: verify that the JNI call did not change it.
  if (CheckJNICalls) {
    call(RuntimeAddress(StubRoutines::x86::verify_fpu_cntrl_wrd_entry()));
  }
#endif // !_LP64
}
// Only 64 bit platforms support GCs that require a tmp register // Only IN_HEAP loads require a thread_tmp register // OopHandle::resolve is an indirection like jobject.
access_load_at(T_OBJECT, IN_NATIVE,
result, Address(result, 0), tmp, /*tmp_thread*/noreg);
}
// A null weak handle resolves to null.
cmpptr(rresult, 0);
jcc(Assembler::equal, resolved);
// Only 64 bit platforms support GCs that require a tmp register // Only IN_HEAP loads require a thread_tmp register // WeakHandle::resolve is an indirection like jweak.
access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF,
rresult, Address(rresult, 0), rtmp, /*tmp_thread*/noreg);
bind(resolved);
}
// Used for storing NULLs. void MacroAssembler::store_heap_oop_null(Address dst) {
access_store_at(T_OBJECT, IN_HEAP, dst, noreg, noreg, noreg, noreg);
}
#ifdef _LP64 void MacroAssembler::store_klass_gap(Register dst, Register src) { if (UseCompressedClassPointers) { // Store to klass gap in destination
movl(Address(dst, oopDesc::klass_gap_offset_in_bytes()), src);
}
}
#ifdef ASSERT
// Debug-only: when CheckCompressedOops is set, emit a check that
// r12_heapbase equals the value at CompressedOops::ptrs_base_addr() and
// stop with 'msg' if it does not.
void MacroAssembler::verify_heapbase(const char* msg) {
  assert (UseCompressedOops, "should be compressed");
  assert (Universe::heap() != NULL, "java heap should be initialized");
  if (!CheckCompressedOops) {
    return;
  }

  Label L_base_ok;
  ExternalAddress base_addr(CompressedOops::ptrs_base_addr());

  // When the address is not reachable, cmpptr trashes rscratch1, so that
  // register is preserved around the comparison.
  const bool save_scratch = !reachable(base_addr);
  if (save_scratch) {
    push(rscratch1);
  }
  cmpptr(r12_heapbase, base_addr, rscratch1);
  jcc(Assembler::equal, L_base_ok);
  STOP(msg);
  bind(L_base_ok);
  if (save_scratch) {
    pop(rscratch1);
  }
}
#endif
// Algorithm must match oop.inline.hpp encode_heap_oop. void MacroAssembler::encode_heap_oop(Register r) { #ifdef ASSERT
verify_heapbase("MacroAssembler::encode_heap_oop: heap base corrupted?"); #endif
verify_oop_msg(r, "broken oop in encode_heap_oop"); if (CompressedOops::base() == NULL) { if (CompressedOops::shift() != 0) {
assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
shrq(r, LogMinObjAlignmentInBytes);
} return;
}
testq(r, r);
cmovq(Assembler::equal, r, r12_heapbase);
subq(r, r12_heapbase);
shrq(r, LogMinObjAlignmentInBytes);
}
// Compress the oop in r in place; the oop must not be null.
// Debug builds emit a null check when CheckCompressedOops is set.
void MacroAssembler::encode_heap_oop_not_null(Register r) {
#ifdef ASSERT
  verify_heapbase("MacroAssembler::encode_heap_oop_not_null: heap base corrupted?");
  if (CheckCompressedOops) {
    Label L_non_null;
    testq(r, r);
    jcc(Assembler::notEqual, L_non_null);
    STOP("null oop passed to encode_heap_oop_not_null");
    bind(L_non_null);
  }
#endif
  verify_oop_msg(r, "broken oop in encode_heap_oop_not_null");

  const bool has_base  = (CompressedOops::base() != NULL);
  const bool has_shift = (CompressedOops::shift() != 0);

  if (has_base) {
    subq(r, r12_heapbase);
  }
  if (has_shift) {
    assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
    shrq(r, LogMinObjAlignmentInBytes);
  }
}
// Compress the non-null oop in src and leave the result in dst.
// src is copied to dst first unless both are the same register.
// Debug builds emit a null check on src when CheckCompressedOops is set.
void MacroAssembler::encode_heap_oop_not_null(Register dst, Register src) {
#ifdef ASSERT
  verify_heapbase("MacroAssembler::encode_heap_oop_not_null2: heap base corrupted?");
  if (CheckCompressedOops) {
    Label L_non_null;
    testq(src, src);
    jcc(Assembler::notEqual, L_non_null);
    STOP("null oop passed to encode_heap_oop_not_null2");
    bind(L_non_null);
  }
#endif
  verify_oop_msg(src, "broken oop in encode_heap_oop_not_null2");

  const bool has_base  = (CompressedOops::base() != NULL);
  const bool has_shift = (CompressedOops::shift() != 0);

  if (dst != src) {
    movq(dst, src);
  }
  if (has_base) {
    subq(dst, r12_heapbase);
  }
  if (has_shift) {
    assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
    shrq(dst, LogMinObjAlignmentInBytes);
  }
}
// Decompress the non-null narrow oop in r in place.
// Note: it will change flags.
void MacroAssembler::decode_heap_oop_not_null(Register r) {
  assert (UseCompressedOops, "should only be used for compressed headers");
  assert (Universe::heap() != NULL, "java heap should be initialized");
  // No emitted checks here: the unverified entry point counts instructions
  // (see .ad file), and vtableStubs also counts instructions in
  // pd_code_size_limit.
  // Also do not verify_oop as this is called by verify_oop.
  if (CompressedOops::shift() == 0) {
    // Without a shift there is nothing to emit.
    assert (CompressedOops::base() == NULL, "sanity");
    return;
  }

  assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
  shlq(r, LogMinObjAlignmentInBytes);
  if (CompressedOops::base() != NULL) {
    addq(r, r12_heapbase);
  }
}
// Decompress the non-null narrow oop in src and leave the result in dst.
// Note: it will change flags.
void MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) {
  assert (UseCompressedOops, "should only be used for compressed headers");
  assert (Universe::heap() != NULL, "java heap should be initialized");
  // No emitted checks here: the unverified entry point counts instructions
  // (see .ad file), and vtableStubs also counts instructions in
  // pd_code_size_limit.
  // Also do not verify_oop as this is called by verify_oop.
  if (CompressedOops::shift() == 0) {
    // No shift: decoding is at most a register move.
    assert (CompressedOops::base() == NULL, "sanity");
    if (dst != src) {
      movq(dst, src);
    }
    return;
  }

  assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
  if (LogMinObjAlignmentInBytes == Address::times_8) {
    // A single lea computes r12_heapbase + src * 8.
    leaq(dst, Address(r12_heapbase, src, Address::times_8, 0));
    return;
  }

  if (dst != src) {
    movq(dst, src);
  }
  shlq(dst, LogMinObjAlignmentInBytes);
  if (CompressedOops::base() != NULL) {
    addq(dst, r12_heapbase);
  }
}
// Decompress the narrow klass pointer in r in place; tmp is used to hold the
// encoding base when there is one.
// Note: it will change flags.
void MacroAssembler::decode_klass_not_null(Register r, Register tmp) {
  assert_different_registers(r, tmp);
  assert(UseCompressedClassPointers, "should only be used for compressed headers");
  // No emitted checks here: the unverified entry point counts instructions
  // (see .ad file), and vtableStubs also counts instructions in
  // pd_code_size_limit.
  // Also do not verify_oop as this is called by verify_oop.
  const bool has_shift = (CompressedKlassPointers::shift() != 0);
  const bool has_base  = (CompressedKlassPointers::base() != NULL);

  if (has_shift) {
    assert(LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
    shlq(r, LogKlassAlignmentInBytes);
  }
  if (has_base) {
    mov64(tmp, (int64_t)CompressedKlassPointers::base());
    addq(r, tmp);
  }
}
// Decompress the narrow klass pointer in src and leave the result in dst.
// dst and src must be different registers.
// Note: it will change flags.
void MacroAssembler::decode_and_move_klass_not_null(Register dst, Register src) {
  assert_different_registers(src, dst);
  assert (UseCompressedClassPointers, "should only be used for compressed headers");
  // No emitted checks here: the unverified entry point counts instructions
  // (see .ad file), and vtableStubs also counts instructions in
  // pd_code_size_limit.
  // Also do not verify_oop as this is called by verify_oop.
  const bool has_base  = (CompressedKlassPointers::base() != NULL);
  const bool has_shift = (CompressedKlassPointers::shift() != 0);

  if (!has_base && !has_shift) {
    // Best case: neither base nor shift, so the value is already a pointer
    // that needs nothing but a register rename.
    movl(dst, src);
    return;
  }

  // Seed dst with the base (or zero), then fold in the narrow value.
  if (has_base) {
    mov64(dst, (int64_t)CompressedKlassPointers::base());
  } else {
    xorq(dst, dst);
  }

  if (has_shift) {
    assert(LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
    assert(LogKlassAlignmentInBytes == Address::times_8, "klass not aligned on 64bits?");
    leaq(dst, Address(dst, src, Address::times_8, 0));
  } else {
    addq(dst, src);
  }
}
// Move the narrow form of 'obj' into register dst. The oop is looked up in
// the OopRecorder and the move carries an oop relocation for its index.
void MacroAssembler::set_narrow_oop(Register dst, jobject obj) {
  assert (UseCompressedOops, "should only be used for compressed headers");
  assert (Universe::heap() != NULL, "java heap should be initialized");
  assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");

  const int recorded_index = oop_recorder()->find_index(obj);
  mov_narrow_oop(dst, recorded_index, oop_Relocation::spec(recorded_index));
}
// Store the narrow form of 'obj' to memory at dst. The oop is looked up in
// the OopRecorder and the store carries an oop relocation for its index.
void MacroAssembler::set_narrow_oop(Address dst, jobject obj) {
  assert (UseCompressedOops, "should only be used for compressed headers");
  assert (Universe::heap() != NULL, "java heap should be initialized");
  assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");

  const int recorded_index = oop_recorder()->find_index(obj);
  mov_narrow_oop(dst, recorded_index, oop_Relocation::spec(recorded_index));
}
// Move the encoded (narrow) form of klass 'k' into register dst. The klass is
// looked up in the OopRecorder and the move carries a metadata relocation.
void MacroAssembler::set_narrow_klass(Register dst, Klass* k) {
  assert (UseCompressedClassPointers, "should only be used for compressed headers");
  assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");

  const int recorded_index = oop_recorder()->find_index(k);
  mov_narrow_oop(dst, CompressedKlassPointers::encode(k),
                 metadata_Relocation::spec(recorded_index));
}
// Store the encoded (narrow) form of klass 'k' to memory at dst. The klass is
// looked up in the OopRecorder and the store carries a metadata relocation.
void MacroAssembler::set_narrow_klass(Address dst, Klass* k) {
  assert (UseCompressedClassPointers, "should only be used for compressed headers");
  assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");

  const int recorded_index = oop_recorder()->find_index(k);
  mov_narrow_oop(dst, CompressedKlassPointers::encode(k),
                 metadata_Relocation::spec(recorded_index));
}
// Compare register dst against the narrow form of 'obj'. The oop is looked
// up in the OopRecorder and the compare carries an oop relocation.
void MacroAssembler::cmp_narrow_oop(Register dst, jobject obj) {
  assert (UseCompressedOops, "should only be used for compressed headers");
  assert (Universe::heap() != NULL, "java heap should be initialized");
  assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");

  const int recorded_index = oop_recorder()->find_index(obj);
  Assembler::cmp_narrow_oop(dst, recorded_index, oop_Relocation::spec(recorded_index));
}
// Compare memory at dst against the narrow form of 'obj'. The oop is looked
// up in the OopRecorder and the compare carries an oop relocation.
void MacroAssembler::cmp_narrow_oop(Address dst, jobject obj) {
  assert (UseCompressedOops, "should only be used for compressed headers");
  assert (Universe::heap() != NULL, "java heap should be initialized");
  assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");

  const int recorded_index = oop_recorder()->find_index(obj);
  Assembler::cmp_narrow_oop(dst, recorded_index, oop_Relocation::spec(recorded_index));
}
// Compare register dst against the encoded (narrow) form of klass 'k'. The
// klass is looked up in the OopRecorder and the compare carries a metadata
// relocation.
void MacroAssembler::cmp_narrow_klass(Register dst, Klass* k) {
  assert (UseCompressedClassPointers, "should only be used for compressed headers");
  assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");

  const int recorded_index = oop_recorder()->find_index(k);
  Assembler::cmp_narrow_oop(dst, CompressedKlassPointers::encode(k),
                            metadata_Relocation::spec(recorded_index));
}
// Compare memory at dst against the encoded (narrow) form of klass 'k'. The
// klass is looked up in the OopRecorder and the compare carries a metadata
// relocation.
void MacroAssembler::cmp_narrow_klass(Address dst, Klass* k) {
  assert (UseCompressedClassPointers, "should only be used for compressed headers");
  assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");

  const int recorded_index = oop_recorder()->find_index(k);
  Assembler::cmp_narrow_oop(dst, CompressedKlassPointers::encode(k),
                            metadata_Relocation::spec(recorded_index));
}
void MacroAssembler::clear_mem(Register base, Register cnt, Register tmp, XMMRegister xtmp, bool is_large, KRegister mask) { // cnt - number of qwords (8-byte words). // base - start address, qword aligned. // is_large - if optimizers know cnt is larger than InitArrayShortSize
assert(base==rdi, "base register must be edi for rep stos");
assert(tmp==rax, "tmp register must be eax for rep stos");
assert(cnt==rcx, "cnt register must be ecx for rep stos");
assert(InitArrayShortSize % BytesPerLong == 0, "InitArrayShortSize should be the multiple of BytesPerLong");
Label DONE; if (!is_large || !UseXMMForObjInit) {
xorptr(tmp, tmp);
}
if (!is_large) {
Label LOOP, LONG;
cmpptr(cnt, InitArrayShortSize/BytesPerLong);
jccb(Assembler::greater, LONG);
NOT_LP64(shlptr(cnt, 1);) // convert to number of 32-bit words for 32-bit VM
decrement(cnt);
jccb(Assembler::negative, DONE); // Zero length
// Use individual pointer-sized stores for small counts:
BIND(LOOP);
movptr(Address(base, cnt, Address::times_ptr), tmp);
decrement(cnt);
jccb(Assembler::greaterEqual, LOOP);
jmpb(DONE);
BIND(LONG);
}
// Use longer rep-prefixed ops for non-small counts: if (UseFastStosb) {
shlptr(cnt, 3); // convert to number of bytes
rep_stosb();
} elseif (UseXMMForObjInit) {
xmm_clear_mem(base, cnt, tmp, xtmp, mask);
} else {
NOT_LP64(shlptr(cnt, 1);) // convert to number of 32-bit words for 32-bit VM
rep_stos();
}
// // length is too short, just fill qwords //
BIND(L_fill_8_bytes_loop);
movl(Address(to, 0), value);
movl(Address(to, 4), value);
addptr(to, 8);
BIND(L_fill_8_bytes);
subl(count, 1 << (shift + 1));
jcc(Assembler::greaterEqual, L_fill_8_bytes_loop); // fall through to fill 4 bytes
} else {
Label L_fill_32_bytes; if (!UseUnalignedLoadStores) { // align to 8 bytes, we know we are 4 byte aligned to start
testptr(to, 4);
jccb(Assembler::zero, L_fill_32_bytes);
movl(Address(to, 0), value);
addptr(to, 4);
subl(count, 1<<shift);
}
BIND(L_fill_32_bytes);
{
assert( UseSSE >= 2, "supported cpu only" );
Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes;
movdl(xtmp, value); if (UseAVX >= 2 && UseUnalignedLoadStores) {
Label L_check_fill_32_bytes; if (UseAVX > 2) { // Fill 64-byte chunks
Label L_fill_64_bytes_loop_avx3, L_check_fill_64_bytes_avx2;
// If number of bytes to fill < VM_Version::avx3_threshold(), perform fill using AVX2
cmpl(count, VM_Version::avx3_threshold());
jccb(Assembler::below, L_check_fill_64_bytes_avx2);
// // length is too short, just fill qwords //
BIND(L_fill_8_bytes_loop);
movq(Address(to, 0), xtmp);
addptr(to, 8);
BIND(L_fill_8_bytes);
subl(count, 1 << (shift + 1));
jcc(Assembler::greaterEqual, L_fill_8_bytes_loop);
}
} // fill trailing 4 bytes
BIND(L_fill_4_bytes);
testl(count, 1<<shift);
jccb(Assembler::zero, L_fill_2_bytes);
movl(Address(to, 0), value); if (t == T_BYTE || t == T_SHORT) {
Label L_fill_byte;
addptr(to, 4);
BIND(L_fill_2_bytes); // fill trailing 2 bytes
testl(count, 1<<(shift-1));
jccb(Assembler::zero, L_fill_byte);
movw(Address(to, 0), value); if (t == T_BYTE) {
addptr(to, 2);
BIND(L_fill_byte); // fill trailing byte
testl(count, 1);
jccb(Assembler::zero, L_exit);
movb(Address(to, 0), value);
} else {
BIND(L_fill_byte);
}
} else {
BIND(L_fill_2_bytes);
}
BIND(L_exit);
}
// Emits the evpbroadcast{b,w,d,q} variant that matches the element type:
//   T_BOOLEAN / T_BYTE  -> evpbroadcastb
//   T_CHAR    / T_SHORT -> evpbroadcastw
//   T_FLOAT   / T_INT   -> evpbroadcastd
//   T_DOUBLE  / T_LONG  -> evpbroadcastq
// dst, src and vector_len are forwarded unchanged to the selected emitter.
// Any other type is reported through fatal().
void MacroAssembler::evpbroadcast(BasicType type, XMMRegister dst, Register src, int vector_len) {
  switch (type) {
    case T_BOOLEAN:
    case T_BYTE:
      evpbroadcastb(dst, src, vector_len);
      return;

    case T_CHAR:
    case T_SHORT:
      evpbroadcastw(dst, src, vector_len);
      return;

    case T_FLOAT:
    case T_INT:
      evpbroadcastd(dst, src, vector_len);
      return;

    case T_DOUBLE:
    case T_LONG:
      evpbroadcastq(dst, src, vector_len);
      return;

    default:
      fatal("Unhandled type : %s", type2name(type));
      return;
  }
}
// encode char[] to byte[] in ISO_8859_1 or ASCII //@IntrinsicCandidate //private static int implEncodeISOArray(byte[] sa, int sp, //byte[] da, int dp, int len) { // int i = 0; // for (; i < len; i++) { // char c = StringUTF16.getChar(sa, sp++); // if (c > '\u00FF') // break; // da[dp++] = (byte)c; // } // return i; //} // //@IntrinsicCandidate //private static int implEncodeAsciiArray(char[] sa, int sp, // byte[] da, int dp, int len) { // int i = 0; // for (; i < len; i++) { // char c = sa[sp++]; // if (c >= '\u0080') // break; // da[dp++] = (byte)c; // } // return i; //} void MacroAssembler::encode_iso_array(Register src, Register dst, Register len,
XMMRegister tmp1Reg, XMMRegister tmp2Reg,
XMMRegister tmp3Reg, XMMRegister tmp4Reg, Register tmp5, Register result, bool ascii) {
if (UseAVX >= 2) {
Label L_chars_32_check, L_copy_32_chars, L_copy_32_chars_exit;
movl(tmp5, mask); // create mask to test for Unicode or non-ASCII chars in vector
movdl(tmp1Reg, tmp5);
vpbroadcastd(tmp1Reg, tmp1Reg, Assembler::AVX_256bit);
jmp(L_chars_32_check);
} elseif (UseSSE42Intrinsics) {
movl(tmp5, mask); // create mask to test for Unicode or non-ASCII chars in vector
movdl(tmp1Reg, tmp5);
pshufd(tmp1Reg, tmp1Reg, 0);
jmpb(L_chars_16_check);
}
/** * Multiply 128 bit by 128 bit using BMI2. Unrolled inner loop. *
*/ void MacroAssembler::multiply_128_x_128_bmi2_loop(Register y, Register z, Register carry, Register carry2, Register idx, Register jdx, Register yz_idx1, Register yz_idx2, Register tmp, Register tmp3, Register tmp4) {
assert(UseBMI2Instructions, "should be used only when BMI2 is available");
/** * Store the squares of x[], right shifted one bit (divided by 2) into z[] * Preserves x and z and modifies rest of the registers.
*/ void MacroAssembler::square_rshift(Register x, Register xlen, Register z, Register tmp1, Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg) { // Perform square and right shift by 1 // Handle odd xlen case first, then for even xlen do the following // jlong carry = 0; // for (int j=0, i=0; j < xlen; j+=2, i+=4) { // huge_128 product = x[j:j+1] * x[j:j+1]; // z[i:i+1] = (carry << 63) | (jlong)(product >>> 65); // z[i+2:i+3] = (jlong)(product >>> 1); // carry = (jlong)product; // }
xorq(tmp5, tmp5); // carry
xorq(rdxReg, rdxReg);
xorl(tmp1, tmp1); // index for x
xorl(tmp4, tmp4); // index for z
Label L_first_loop, L_first_loop_exit;
testl(xlen, 1);
jccb(Assembler::zero, L_first_loop); //jump if xlen is even
// Square and right shift by 1 the odd element using 32 bit multiply
movl(raxReg, Address(x, tmp1, Address::times_4, 0));
imulq(raxReg, raxReg);
shrq(raxReg, 1);
adcq(tmp5, 0);
movq(Address(z, tmp4, Address::times_4, 0), raxReg);
incrementl(tmp1);
addl(tmp4, 2);
// Square and right shift by 1 the rest using 64 bit multiply
bind(L_first_loop);
cmpptr(tmp1, xlen);
jccb(Assembler::equal, L_first_loop_exit);
/** * Add 64 bit long carry into z[] with carry propagation. * Preserves z and carry register values and modifies rest of registers. *
*/ void MacroAssembler::add_one_64(Register z, Register zlen, Register carry, Register tmp1) {
Label L_fourth_loop, L_fourth_loop_exit;
/** * Helper function for mul_add() * Multiply the in[] by int k and add to out[] starting at offset offs using * 128 bit by 32 bit multiply and return the carry in tmp5. * Only quad int aligned length of in[] is operated on in this function. * k is in rdxReg for BMI2Instructions, for others it is in tmp2. * This function preserves out, in and k registers. * len and offset point to the appropriate index in "in" & "out" correspondingly * tmp5 has the carry. * other registers are temporary and are modified. *
*/ void MacroAssembler::mul_add_128_x_32_loop(Register out, Register in, Register offset, Register len, Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {
//Multiply in[] by k in a 4 way unrolled loop using 128 bit by 32 bit multiply //The carry is in tmp5
mul_add_128_x_32_loop(out, in, offs, len, tmp1, tmp2, tmp3, tmp4, tmp5, rdxReg, raxReg);
//Multiply the trailing in[] entry using 64 bit by 32 bit, if any
decrementl(len);
jccb(Assembler::negative, L_carry);
decrementl(len);
jccb(Assembler::negative, L_last_in);
// For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge // context for the registers used, where all instructions below are using 128-bit mode // On EVEX without VL and BW, these instructions will all be AVX.
lea(table, ExternalAddress(StubRoutines::crc_table_addr()));
notl(crc); // ~crc
cmpl(len, 16);
jcc(Assembler::less, L_tail);
// Fold total 512 bits of polynomial on each iteration, // 128 bits per each of 4 parallel streams.
movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 32), rscratch1);
// check if there is enough buffer to be able to fold 16B at a time
cmpl(len, 32);
jcc(Assembler::less, L_less_than_32);
// if there is, load the constants
movdqu(xmm10, Address(table, 1 * 16)); //rk1 and rk2 in xmm10
movdl(xmm0, crc); // get the initial crc value
movdqu(xmm7, Address(buf, pos, Address::times_1, 0 * 16)); //load the plaintext
pxor(xmm7, xmm0);
// update the buffer pointer
addl(pos, 16); //update the counter.subtract 32 instead of 16 to save one instruction from the loop
subl(len, 32);
jmp(L_16B_reduction_loop);
bind(L_less_than_32); //mov initial crc to the return value. this is necessary for zero - length buffers.
movl(rax, crc);
testl(len, len);
jcc(Assembler::equal, L_cleanup);
movdqu(xmm7, Address(buf, pos, Address::times_1, 0 * 16)); //load the plaintext
pxor(xmm7, xmm0); //xor the initial crc value
addl(pos, 16);
subl(len, 16);
movdqu(xmm10, Address(table, 1 * 16)); // rk1 and rk2 in xmm10
jmp(L_get_last_two_xmms);
bind(L_less_than_16_left); //use stack space to load data less than 16 bytes, zero - out the 16B in memory first.
pxor(xmm1, xmm1);
movptr(tmp1, rsp);
movdqu(Address(tmp1, 0 * 16), xmm1);
movdqu(xmm7, Address(rsp, 0));
pxor(xmm7, xmm0); //xor the initial crc value
pslldq(xmm7, 0x7);
}
/** * Compute CRC32 using AVX512 instructions * param crc register containing existing CRC (32-bit) * param buf register pointing to input byte buffer (byte*) * param len register containing number of bytes * param table address of crc or crc32c table * param tmp1 scratch register * param tmp2 scratch register * return rax result register * * This routine is identical for crc32c with the exception of the precomputed constant * table which will be passed as the table argument. The calculation steps are * the same for both variants.
*/ void MacroAssembler::kernel_crc32_avx512(Register crc, Register buf, Register len, Register table, Register tmp1, Register tmp2) {
assert_different_registers(crc, buf, len, table, tmp1, tmp2, rax, r12);
// For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge // context for the registers used, where all instructions below are using 128-bit mode // On EVEX without VL and BW, these instructions will all be AVX.
movl(pos, 0);
// check if smaller than 256B
cmpl(len, 256);
jcc(Assembler::less, L_less_than_256);
// load the initial crc value
movdl(xmm10, crc);
// receive the initial 64B data, xor the initial crc value
evmovdquq(xmm0, Address(buf, pos, Address::times_1, 0 * 64), Assembler::AVX_512bit);
evmovdquq(xmm4, Address(buf, pos, Address::times_1, 1 * 64), Assembler::AVX_512bit);
evpxorq(xmm0, xmm0, xmm10, Assembler::AVX_512bit);
evbroadcasti32x4(xmm10, Address(table, 2 * 16), Assembler::AVX_512bit); //zmm10 has rk3 and rk4
// at this section of the code, there is 128 * x + y(0 <= y<128) bytes of buffer.The fold_128_B_loop // loop will fold 128B at a time until we have 128 + y Bytes of buffer
// fold 128B at a time.This section of the code folds 8 xmm registers in parallel
bind(L_fold_128_B_loop);
addl(pos, 128);
fold512bit_crc32_avx512(xmm0, xmm10, xmm1, buf, pos, 0 * 64);
fold512bit_crc32_avx512(xmm4, xmm10, xmm1, buf, pos, 1 * 64);
// at this point, the buffer pointer is pointing at the last y Bytes of the buffer, where 0 <= y < 128 // the 128B of folded data is in 8 of the xmm registers : xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
bind(L_fold_128_B_register);
evmovdquq(xmm16, Address(table, 5 * 16), Assembler::AVX_512bit); // multiply by rk9-rk16
evmovdquq(xmm11, Address(table, 9 * 16), Assembler::AVX_512bit); // multiply by rk17-rk20, rk1,rk2, 0,0
evpclmulqdq(xmm1, xmm0, xmm16, 0x01, Assembler::AVX_512bit);
evpclmulqdq(xmm2, xmm0, xmm16, 0x10, Assembler::AVX_512bit); // save last that has no multiplicand
vextracti64x2(xmm7, xmm4, 3);
// instead of 128, we add 128 - 16 to the loop counter to save 1 instruction from the loop // instead of a cmp instruction, we use the negative flag with the jl instruction
addl(len, 128 - 16);
jcc(Assembler::less, L_final_reduction_for_128);
// get rid of the extra data that was loaded before // load the shift constant
lea(rax, ExternalAddress(StubRoutines::x86::shuf_table_crc32_avx512_addr()));
movdqu(xmm0, Address(rax, len));
addl(rax, len);
#ifdef _LP64 // Algorithm 2: Pipelined usage of the CRC32 instruction. // Input: A buffer I of L bytes. // Output: the CRC32C value of the buffer. // Notations: // Write L = 24N + r, with N = floor (L/24). // r = L mod 24 (0 <= r < 24). // Consider I as the concatenation of A|B|C|R, where A, B, C, each, // N quadwords, and R consists of r bytes. // A[j] = I [8j+7:8j], j= 0, 1, ..., N-1 // B[j] = I [N + 8j+7:N + 8j], j= 0, 1, ..., N-1 // C[j] = I [2N + 8j+7:2N + 8j], j= 0, 1, ..., N-1 // if r > 0 R[j] = I [3N +j], j= 0, 1, ...,r-1 void MacroAssembler::crc32c_ipl_alg2_alt2(Register in_out, Register in1, Register in2, Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5, Register tmp6,
XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3, bool is_pclmulqdq_supported) {
uint32_t const_or_pre_comp_const_index[CRC32C_NUM_PRECOMPUTED_CONSTANTS];
Label L_wordByWord;
Label L_byteByByteProlog;
Label L_byteByByte;
Label L_exit;
const_or_pre_comp_const_index[5] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 4);
const_or_pre_comp_const_index[4] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 5);
assert((CRC32C_NUM_PRECOMPUTED_CONSTANTS - 1 ) == 5, "Checking whether you declared all of the constants based on the number of \"chunks\"");
} else {
const_or_pre_comp_const_index[0] = 1;
const_or_pre_comp_const_index[1] = 0;
// if length of the string is less than 16, handle it in an old fashioned way
testl(len, -32);
jcc(Assembler::zero, below_threshold);
// First check whether a character is compressible ( <= 0xFF). // Create mask to test for Unicode chars inside zmm vector
movl(result, 0x00FF);
evpbroadcastw(tmp2Reg, result, Assembler::AVX_512bit);
// bail out when there is nothing to be done
testl(tmp5, 0xFFFFFFFF);
jcc(Assembler::zero, post_alignment);
// ~(~0 << len), where len is the # of remaining elements to process
movl(result, 0xFFFFFFFF);
shlxl(result, result, tmp5);
notl(result);
kmovdl(mask2, result);
// All elements in current processed chunk are valid candidates for // compression. Write a truncated byte elements to the memory.
evpmovwb(Address(dst, len, Address::times_1), tmp1Reg, Assembler::AVX_512bit);
addptr(len, 32);
jcc(Assembler::notZero, copy_32_loop);
bind(copy_loop_tail); // bail out when there is nothing to be done
testl(tmp5, 0xFFFFFFFF);
jcc(Assembler::zero, return_length);
movl(len, tmp5);
// ~(~0 << len), where len is the # of remaining elements to process
movl(result, 0xFFFFFFFF);
shlxl(result, result, len);
notl(result);
// In order to use only one arithmetic operation for the main loop we use // this pre-calculation
andl(tmp2, (32 - 1)); // tail count (in chars), 32 element wide loop
andl(len, -32); // vector count
jccb(Assembler::zero, copy_tail);
bind(copy_tail); // bail out when there is nothing to be done
testl(tmp2, -1); // we don't destroy the contents of tmp2 here
jcc(Assembler::zero, done);
// ~(~0 << length), where length is the # of remaining elements to process
movl(tmp3_aliased, -1);
shlxl(tmp3_aliased, tmp3_aliased, tmp2);
notl(tmp3_aliased);
kmovdl(mask, tmp3_aliased);
evpmovzxbw(tmp1, mask, Address(src, 0), Assembler::AVX_512bit);
evmovdquw(Address(dst, 0), mask, tmp1, /*merge*/ true, Assembler::AVX_512bit);
int shift = -1; int avx3threshold = VM_Version::avx3_threshold(); switch(type) { case T_BYTE: shift = 0; break; case T_SHORT: shift = 1; break; case T_INT: shift = 2; break; /* Uncomment when LONG fill stubs are supported. case T_LONG: shift = 3; break;
*/ default:
fatal("Unhandled type: %s\n", type2name(type));
}
if ((avx3threshold != 0) || (MaxVectorSize == 32)) {
// Emits a single cache-line write-back instruction for `line`.
//
// Instruction selection, driven by the VM_Version feature queries:
//   clflushopt unsupported           -> clflush
//   clflushopt and clwb supported    -> clwb
//   clflushopt supported, clwb not   -> clflushopt
void MacroAssembler::cache_wb(Address line)
{
  // 64 bit cpus always support clflush
  assert(VM_Version::supports_clflush(), "clflush should be available");

  const bool has_clflushopt = VM_Version::supports_clflushopt();
  const bool has_clwb       = VM_Version::supports_clwb();

  if (!has_clflushopt) {
    // Fallback: serial writeback with evict.
    // no need for fence when using CLFLUSH
    clflush(line);
    return;
  }

  if (has_clwb) {
    // Preferred: writeback without evict.
    clwb(line);
  } else {
    // Next best: potentially parallel writeback with evict.
    clflushopt(line);
  }
}
// Emits the synchronization that goes with cache line write-back.
//
// is_pre == true  : nothing is emitted.
// is_pre == false : an sfence is emitted when clflushopt or clwb is
//                   available; with neither, nothing is emitted.
void MacroAssembler::cache_wbsync(bool is_pre)
{
  assert(VM_Version::supports_clflush(), "clflush should be available");

  const bool has_clflushopt = VM_Version::supports_clflushopt();
  const bool has_clwb       = VM_Version::supports_clwb();

  if (is_pre) {
    // No synchronization is emitted ahead of the flush.
    return;
  }

  if (!has_clflushopt && !has_clwb) {
    // Neither clflushopt nor clwb: no need for any synchronization.
    return;
  }

  // need an sfence for post flush when using clflushopt or clwb
  sfence();
}
#endif// _LP64
// Returns the logical complement of a branch condition, i.e. the condition
// paired with `cond` in the table below (zero <-> notZero, less <->
// greaterEqual, below <-> aboveEqual, ...).
//
// cond    : the condition to invert.
// returns : the opposite condition.
//
// A value not listed in the switch ends up in ShouldNotReachHere().
Assembler::Condition MacroAssembler::negate_condition(Assembler::Condition cond) {
  switch (cond) {
    // Note some conditions are synonyms for others
    // (this comment must stay on its own line: sharing a line with the case
    // labels would comment them out and leave the switch empty)
    case Assembler::zero:         return Assembler::notZero;
    case Assembler::notZero:      return Assembler::zero;
    case Assembler::less:         return Assembler::greaterEqual;
    case Assembler::lessEqual:    return Assembler::greater;
    case Assembler::greater:      return Assembler::lessEqual;
    case Assembler::greaterEqual: return Assembler::less;
    case Assembler::below:        return Assembler::aboveEqual;
    case Assembler::belowEqual:   return Assembler::above;
    case Assembler::above:        return Assembler::belowEqual;
    case Assembler::aboveEqual:   return Assembler::below;
    case Assembler::overflow:     return Assembler::noOverflow;
    case Assembler::noOverflow:   return Assembler::overflow;
    case Assembler::negative:     return Assembler::positive;
    case Assembler::positive:     return Assembler::negative;
    case Assembler::parity:       return Assembler::noParity;
    case Assembler::noParity:     return Assembler::parity;
  }
  ShouldNotReachHere();
  // Reached only for an unlisted value; presumably present just to give the
  // function a return value on every path.
  return Assembler::overflow;
}
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.