/* * Copyright (c) 2020, 2022, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License version 2 only, as * published by the Free Software Foundation. * * This code is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License * version 2 for more details (a copy is included in the LICENSE file that * accompanied this code). * * You should have received a copy of the GNU General Public License version * 2 along with this work; if not, write to the Free Software Foundation, * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. * * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA * or visit www.oracle.com if you need additional information or have any * questions. *
*/
// WARNING: Initial instruction MUST be 5 bytes or longer so that // NativeJump::patch_verified_entry will be able to patch out the entry // code safely. The push to verify stack depth is ok at 5 bytes, // the frame allocation can be either 3 or 6 bytes. So if we don't do // stack bang then we must use the 6 byte frame allocation even if // we have no frame. :-(
assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect");
assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned"); // Remove word for return addr
framesize -= wordSize;
stack_bang_size -= wordSize;
// Calls to C2R adapters often do not accept exceptional returns. // We require that their callers must bang for them. But be careful, because // some VM calls (such as call site linkage) can use several kilobytes of // stack. But the stack safety zone should account for that. // See bugs 4446381, 4468289, 4497237. if (stack_bang_size > 0) {
generate_stack_overflow_check(stack_bang_size);
// We always push rbp, so that on return to interpreter rbp, will be // restored correctly and we can correct the stack.
push(rbp); // Save caller's stack pointer into RBP if the frame pointer is preserved. if (PreserveFramePointer) {
mov(rbp, rsp);
} // Remove word for ebp
framesize -= wordSize;
// Create frame if (framesize) {
subptr(rsp, framesize);
}
} else { // Create frame (force generation of a 4 byte immediate value)
subptr_imm32(rsp, framesize);
// Save RBP register now.
framesize -= wordSize;
movptr(Address(rsp, framesize), rbp); // Save caller's stack pointer into RBP if the frame pointer is preserved. if (PreserveFramePointer) {
movptr(rbp, rsp); if (framesize > 0) {
addptr(rbp, framesize);
}
}
}
if (VerifyStackAtCalls) { // Majik cookie to verify stack depth
framesize -= wordSize;
movptr(Address(rsp, framesize), (int32_t)0xbadb100d);
}
#ifndef _LP64 // If method sets FPU control word do it now if (fp_mode_24b) {
fldcw(ExternalAddress(StubRoutines::x86::addr_fpu_cntrl_wrd_24()));
} if (UseSSE >= 2 && VerifyFPU) {
verify_FPU(0, "FPU stack must be clean on entry");
} #endif
#ifdef ASSERT if (VerifyStackAtCalls) {
Label L;
push(rax);
mov(rax, rsp);
andptr(rax, StackAlignmentInBytes-1);
cmpptr(rax, StackAlignmentInBytes-wordSize);
pop(rax);
jcc(Assembler::equal, L);
STOP("Stack is not properly aligned!");
bind(L);
} #endif
if (!is_stub) {
BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler(); #ifdef _LP64 if (BarrierSet::barrier_set()->barrier_set_nmethod() != NULL) { // We put the non-hot code of the nmethod entry barrier out-of-line in a stub.
Label dummy_slow_path;
Label dummy_continuation;
Label* slow_path = &dummy_slow_path;
Label* continuation = &dummy_continuation; if (!Compile::current()->output()->in_scratch_emit_size()) { // Use real labels from actual stub when not emitting code for the purpose of measuring its size
C2EntryBarrierStub* stub = Compile::current()->output()->entry_barrier_table()->add_entry_barrier();
slow_path = &stub->slow_path();
continuation = &stub->continuation();
}
bs->nmethod_entry_barrier(this, slow_path, continuation);
} #else // Don't bother with out-of-line nmethod entry barrier stub for x86_32.
bs->nmethod_entry_barrier(this, NULL /* slow_path */, NULL /* continuation */); #endif
}
}
// Update rtm_counters based on abort status // input: abort_status // rtm_counters (RTMLockingCounters*) // flags are killed void C2_MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters) {
atomic_incptr(Address(rtm_counters, RTMLockingCounters::abort_count_offset())); if (PrintPreciseRTMLockingStatistics) { for (int i = 0; i < RTMLockingCounters::ABORT_STATUS_LIMIT; i++) {
Label check_abort;
testl(abort_status, (1<<i));
jccb(Assembler::equal, check_abort);
atomic_incptr(Address(rtm_counters, RTMLockingCounters::abortX_count_offset() + (i * sizeof(uintx))));
bind(check_abort);
}
}
}
// Branch if (random & (count-1) != 0), count is 2^n // tmp, scr and flags are killed void C2_MacroAssembler::branch_on_random_using_rdtsc(Register tmp, Register scr, int count, Label& brLabel) {
assert(tmp == rax, "");
assert(scr == rdx, "");
rdtsc(); // modifies EDX:EAX
andptr(tmp, count-1);
jccb(Assembler::notZero, brLabel);
}
// Perform abort ratio calculation, set no_rtm bit if high ratio // input: rtm_counters_Reg (RTMLockingCounters* address) // tmpReg, rtm_counters_Reg and flags are killed void C2_MacroAssembler::rtm_abort_ratio_calculation(Register tmpReg, Register rtm_counters_Reg,
RTMLockingCounters* rtm_counters,
Metadata* method_data) {
Label L_done, L_check_always_rtm1, L_check_always_rtm2;
if (RTMLockingCalculationDelay > 0) { // Delay calculation
movptr(tmpReg, ExternalAddress((address) RTMLockingCounters::rtm_calculation_flag_addr()));
testptr(tmpReg, tmpReg);
jccb(Assembler::equal, L_done);
} // Abort ratio calculation only if abort_count > RTMAbortThreshold // Aborted transactions = abort_count * 100 // All transactions = total_count * RTMTotalCountIncrRate // Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio)
Register scrReg = rtm_counters_Reg;
movptr(scrReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
imulptr(scrReg, scrReg, RTMTotalCountIncrRate);
imulptr(scrReg, scrReg, RTMAbortRatio);
cmpptr(tmpReg, scrReg);
jccb(Assembler::below, L_check_always_rtm1); if (method_data != NULL) { // set rtm_state to "no rtm" in MDO
mov_metadata(tmpReg, method_data);
lock();
orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), NoRTM);
}
jmpb(L_done);
bind(L_check_always_rtm1); // Reload RTMLockingCounters* address
lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
bind(L_check_always_rtm2);
movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
cmpptr(tmpReg, RTMLockingThreshold / RTMTotalCountIncrRate);
jccb(Assembler::below, L_done); if (method_data != NULL) { // set rtm_state to "always rtm" in MDO
mov_metadata(tmpReg, method_data);
lock();
orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), UseRTM);
}
bind(L_done);
}
// Update counters and perform abort ratio calculation // input: abort_status_Reg // rtm_counters_Reg, flags are killed void C2_MacroAssembler::rtm_profiling(Register abort_status_Reg, Register rtm_counters_Reg,
RTMLockingCounters* rtm_counters,
Metadata* method_data, bool profile_rtm) {
assert(rtm_counters != NULL, "should not be NULL when profiling RTM"); // update rtm counters based on rax value at abort // reads abort_status_Reg, updates flags
lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
rtm_counters_update(abort_status_Reg, rtm_counters_Reg); if (profile_rtm) { // Save abort status because abort_status_Reg is used by following code. if (RTMRetryCount > 0) {
push(abort_status_Reg);
}
assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
rtm_abort_ratio_calculation(abort_status_Reg, rtm_counters_Reg, rtm_counters, method_data); // restore abort status if (RTMRetryCount > 0) {
pop(abort_status_Reg);
}
}
}
// Retry on abort if abort's status is 0x6: can retry (0x2) | memory conflict (0x4) // inputs: retry_count_Reg // : abort_status_Reg // output: retry_count_Reg decremented by 1 // flags are killed void C2_MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg, Label& retryLabel) {
Label doneRetry;
assert(abort_status_Reg == rax, ""); // The abort reason bits are in eax (see all states in rtmLocking.hpp) // 0x6 = conflict on which we can retry (0x2) | memory conflict (0x4) // if reason is in 0x6 and retry count != 0 then retry
andptr(abort_status_Reg, 0x6);
jccb(Assembler::zero, doneRetry);
testl(retry_count_Reg, retry_count_Reg);
jccb(Assembler::zero, doneRetry);
pause();
decrementl(retry_count_Reg);
jmp(retryLabel);
bind(doneRetry);
}
// Spin and retry if lock is busy, // inputs: box_Reg (monitor address) // : retry_count_Reg // output: retry_count_Reg decremented by 1 // : clear z flag if retry count exceeded // tmp_Reg, scr_Reg, flags are killed void C2_MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register box_Reg, Register tmp_Reg, Register scr_Reg, Label& retryLabel) {
Label SpinLoop, SpinExit, doneRetry; int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
bind(SpinExit);
jmp(retryLabel);
bind(doneRetry);
incrementl(retry_count_Reg); // clear z flag
}
// Use RTM for normal stack locks // Input: objReg (object to lock) void C2_MacroAssembler::rtm_stack_locking(Register objReg, Register tmpReg, Register scrReg, Register retry_on_abort_count_Reg,
RTMLockingCounters* stack_rtm_counters,
Metadata* method_data, bool profile_rtm,
Label& DONE_LABEL, Label& IsInflated) {
assert(UseRTMForStackLocks, "why call this otherwise?");
assert(tmpReg == rax, "");
assert(scrReg == rdx, "");
Label L_rtm_retry, L_decrement_retry, L_on_abort;
if (RTMRetryCount > 0) {
movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
bind(L_rtm_retry);
}
movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
testptr(tmpReg, markWord::monitor_value); // inflated vs stack-locked|neutral
jcc(Assembler::notZero, IsInflated);
if (PrintPreciseRTMLockingStatistics || profile_rtm) {
Label L_noincrement; if (RTMTotalCountIncrRate > 1) { // tmpReg, scrReg and flags are killed
branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
}
assert(stack_rtm_counters != NULL, "should not be NULL when profiling RTM");
atomic_incptr(ExternalAddress((address)stack_rtm_counters->total_count_addr()), scrReg);
bind(L_noincrement);
}
xbegin(L_on_abort);
movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // fetch markword
andptr(tmpReg, markWord::lock_mask_in_place); // look at 2 lock bits
cmpptr(tmpReg, markWord::unlocked_value); // bits = 01 unlocked
jcc(Assembler::equal, DONE_LABEL); // all done if unlocked
Register abort_status_Reg = tmpReg; // status of abort is stored in RAX if (UseRTMXendForLockBusy) {
xend();
movptr(abort_status_Reg, 0x2); // Set the abort status to 2 (so we can retry)
jmp(L_decrement_retry);
} else {
xabort(0);
}
bind(L_on_abort); if (PrintPreciseRTMLockingStatistics || profile_rtm) {
rtm_profiling(abort_status_Reg, scrReg, stack_rtm_counters, method_data, profile_rtm);
}
bind(L_decrement_retry); if (RTMRetryCount > 0) { // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
}
}
// Appears unlocked - try to swing _owner from null to non-null. // Invariant: tmpReg == 0. tmpReg is EAX which is the implicit cmpxchg comparand. #ifdef _LP64 Register threadReg = r15_thread; #else
get_thread(scrReg); Register threadReg = scrReg; #endif
lock();
cmpxchgptr(threadReg, Address(boxReg, owner_offset)); // Updates tmpReg
if (RTMRetryCount > 0) { // success done else retry
jccb(Assembler::equal, DONE_LABEL) ;
bind(L_decrement_retry); // Spin and retry if lock is busy.
rtm_retry_lock_on_busy(retry_on_busy_count_Reg, boxReg, tmpReg, scrReg, L_rtm_retry);
} else {
bind(L_decrement_retry);
}
}
#endif// INCLUDE_RTM_OPT
// fast_lock and fast_unlock used by C2
// Because the transitions from emitted code to the runtime
// monitorenter/exit helper stubs are so slow it's critical that
// we inline both the stack-locking fast path and the inflated fast path.
//
// See also: cmpFastLock and cmpFastUnlock.
//
// What follows is a specialized inline transliteration of the code
// in enter() and exit(). If we're concerned about I$ bloat another
// option would be to emit TrySlowEnter and TrySlowExit methods
// at startup-time. These methods would accept arguments as
// (rax=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
// indications in the icc.ZFlag. fast_lock and fast_unlock would simply
// marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
// In practice, however, the # of lock sites is bounded and is usually small.
// Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
// if the processor uses simple bimodal branch predictors keyed by EIP,
// since the helper routines would be called from multiple synchronization
// sites.
//
// An even better approach would be to write "MonitorEnter()" and "MonitorExit()"
// in java - using j.u.c and unsafe - and just bind the lock and unlock sites
// to those specialized methods. That'd give us a mostly platform-independent
// implementation that the JITs could optimize and inline at their pleasure.
// Done correctly, the only time we'd need to cross to native code would be
// to park() or unpark() threads. We'd also need a few more unsafe operators
// to (a) prevent compiler-JIT reordering of non-volatile accesses, and
// (b) provide explicit barriers or fence operations.
//
// TODO:
//
// * Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
//   This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
//   Given TLAB allocation, Self is usually manifested in a register, so passing it into
//   the lock operators would typically be faster than reifying Self.
// // * Ideally I'd define the primitives as: // fast_lock (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED. // fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED // Unfortunately ADLC bugs prevent us from expressing the ideal form. // Instead, we're stuck with a rather awkward and brittle register assignments below. // Furthermore the register assignments are overconstrained, possibly resulting in // sub-optimal code near the synchronization site. // // * Eliminate the sp-proximity tests and just use "== Self" tests instead. // Alternately, use a better sp-proximity test. // // * Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value. // Either one is sufficient to uniquely identify a thread. // TODO: eliminate use of sp in _owner and use get_thread(tr) instead. // // * Intrinsify notify() and notifyAll() for the common cases where the // object is locked by the calling thread but the waitlist is empty. // avoid the expensive JNI call to JVM_Notify() and JVM_NotifyAll(). // // * use jccb and jmpb instead of jcc and jmp to improve code density. // But beware of excessive branch density on AMD Opterons. // // * Both fast_lock and fast_unlock set the ICC.ZF to indicate success // or failure of the fast path. If the fast path fails then we pass // control to the slow path, typically in C. In fast_lock and // fast_unlock we often branch to DONE_LABEL, just to find that C2 // will emit a conditional branch immediately after the node. // So we have branches to branches and lots of ICC.ZF games. // Instead, it might be better to have C2 pass a "FailureLabel" // into fast_lock and fast_unlock. In the case of success, control // will drop through the node. ICC.ZF is undefined at exit. // In the case of failure, the node will branch directly to the // FailureLabel
// Possible cases that we'll encounter in fast_lock // ------------------------------------------------ // * Inflated // -- unlocked // -- Locked // = by self // = by other // * neutral // * stack-locked // -- by self // = sp-proximity test hits // = sp-proximity test generates false-negative // -- by other //
// Recursive locking. // The object is stack-locked: markword contains stack pointer to BasicLock. // Locked by current thread if difference with current SP is less than one page.
subptr(tmpReg, rsp); // Next instruction set ZFlag == 1 (Success) if difference is less then one page.
andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - os::vm_page_size())) );
movptr(Address(boxReg, 0), tmpReg);
} else { // Clear ZF so that we take the slow path at the DONE label. objReg is known to be not 0.
testptr(objReg, objReg);
}
jmp(DONE_LABEL);
bind(IsInflated); // The object is inflated. tmpReg contains pointer to ObjectMonitor* + markWord::monitor_value
#if INCLUDE_RTM_OPT // Use the same RTM locking code in 32- and 64-bit VM. if (use_rtm) {
rtm_inflated_locking(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg,
rtm_counters, method_data, profile_rtm, DONE_LABEL);
} else { #endif// INCLUDE_RTM_OPT
#ifndef _LP64 // The object is inflated.
// boxReg refers to the on-stack BasicLock in the current frame. // We'd like to write: // set box->_displaced_header = markWord::unused_mark(). Any non-0 value suffices. // This is convenient but results a ST-before-CAS penalty. The following CAS suffers // additional latency as we have another ST in the store buffer that must drain.
// avoid ST-before-CAS // register juggle because we need tmpReg for cmpxchgptr below
movptr(scrReg, boxReg);
movptr(boxReg, tmpReg); // consider: LEA box, [tmp-2]
// Appears unlocked - try to swing _owner from null to non-null. // Ideally, I'd manifest "Self" with get_thread and then attempt // to CAS the register containing Self into m->Owner. // But we don't have enough registers, so instead we can either try to CAS // rsp or the address of the box (in scr) into &m->owner. If the CAS succeeds // we later store "Self" into m->Owner. Transiently storing a stack address // (rsp or the address of the box) into m->owner is harmless. // Invariant: tmpReg == 0. tmpReg is EAX which is the implicit cmpxchg comparand.
lock();
cmpxchgptr(scrReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
movptr(Address(scrReg, 0), 3); // box->_displaced_header = 3 // If we weren't able to swing _owner from NULL to the BasicLock // then take the slow path.
jccb (Assembler::notZero, NO_COUNT); // update _owner from BasicLock to thread
get_thread (scrReg); // beware: clobbers ICCs
movptr(Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), scrReg);
xorptr(boxReg, boxReg); // set icc.ZFlag = 1 to indicate success
// If the CAS fails we can either retry or pass control to the slow path. // We use the latter tactic. // Pass the CAS result in the icc.ZFlag into DONE_LABEL // If the CAS was successful ... // Self has acquired the lock // Invariant: m->_recursions should already be 0, so we don't need to explicitly set it. // Intentional fall-through into DONE_LABEL ... #else// _LP64 // It's inflated and we use scrReg for ObjectMonitor* in this section.
movq(scrReg, tmpReg);
xorq(tmpReg, tmpReg);
lock();
cmpxchgptr(r15_thread, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner))); // Unconditionally set box->_displaced_header = markWord::unused_mark(). // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
movptr(Address(boxReg, 0), checked_cast<int32_t>(markWord::unused_mark().value())); // Propagate ICC.ZF from CAS above into DONE_LABEL.
jccb(Assembler::equal, COUNT); // CAS above succeeded; propagate ZF = 1 (success)
cmpptr(r15_thread, rax); // Check if we are already the owner (recursive lock)
jccb(Assembler::notEqual, NO_COUNT); // If not recursive, ZF = 0 at this point (fail)
incq(Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
xorq(rax, rax); // Set ZF = 1 (success) for recursive lock, denoting locking success #endif// _LP64 #if INCLUDE_RTM_OPT
} // use_rtm() #endif
bind(DONE_LABEL);
// ZFlag == 1 count in fast path // ZFlag == 0 count in slow path
jccb(Assembler::notZero, NO_COUNT); // jump if ZFlag == 0
bind(COUNT); // Count monitors in fast path #ifndef _LP64
get_thread(tmpReg);
incrementl(Address(tmpReg, JavaThread::held_monitor_count_offset())); #else// _LP64
incrementq(Address(r15_thread, JavaThread::held_monitor_count_offset())); #endif
xorl(tmpReg, tmpReg); // Set ZF == 1
bind(NO_COUNT);
// At NO_COUNT the icc ZFlag is set as follows ... // fast_unlock uses the same protocol. // ZFlag == 1 -> Success // ZFlag == 0 -> Failure - force control through the slow path
}
// obj: object to unlock
// box: box address (displaced header location), killed. Must be EAX.
// tmp: killed, cannot be obj nor box.
//
// Some commentary on balanced locking:
//
// fast_lock and fast_unlock are emitted only for provably balanced lock sites.
// Methods that don't have provably balanced locking are forced to run in the
// interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
// The interpreter provides two properties:
// I1:  At return-time the interpreter automatically and quietly unlocks any
//      objects acquired by the current activation (frame). Recall that the
//      interpreter maintains an on-stack list of locks currently held by
//      a frame.
// I2:  If a method attempts to unlock an object that is not held by the
//      frame, the interpreter throws IMSX.
//
// Let's say A(), which has provably balanced locking, acquires O and then calls B().
// B() doesn't have provably balanced locking so it runs in the interpreter.
// Control returns to A() and A() unlocks O. By I1 and I2, above, we know that O
// is still locked by A().
//
// The only other source of unbalanced locking would be JNI. The "Java Native Interface:
// Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
// should not be unlocked by "normal" java-level locking and vice-versa. The specification
// doesn't specify what will occur if a program engages in such mixed-mode locking, however.
// Arguably, given that the spec legislates the JNI case as undefined, our implementation
// could reasonably *avoid* checking owner in fast_unlock().
// In the interest of performance we elide the m->Owner==Self check in unlock.
// A perfectly viable alternative is to elide the owner check except when
// Xcheck:jni is enabled.
// Despite our balanced locking property we still check that m->_owner == Self // as java routines or native JNI code called by this thread might // have released the lock. // Refer to the comments in synchronizer.cpp for how we might encode extra // state in _succ so we can avoid fetching EntryList|cxq. // // If there's no contention try a 1-0 exit. That is, exit without // a costly MEMBAR or CAS. See synchronizer.cpp for details on how // we detect and recover from the race that the 1-0 exit admits. // // Conceptually fast_unlock() must execute a STST|LDST "release" barrier // before it STs null into _owner, releasing the lock. Updates // to data protected by the critical section must be visible before // we drop the lock (and thus before any other thread could acquire // the lock and observe the fields protected by the lock). // IA32's memory-model is SPO, so STs are ordered with respect to // each other and there's no need for an explicit barrier (fence). // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html. #ifndef _LP64 // Note that we could employ various encoding schemes to reduce // the number of loads below (currently 4) to just 2 or 3. // Refer to the comments in synchronizer.cpp. // In practice the chain of fetches doesn't seem to impact performance, however.
xorptr(boxReg, boxReg);
orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
jccb (Assembler::notZero, DONE_LABEL);
movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
jccb (Assembler::notZero, DONE_LABEL);
movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
jmpb (DONE_LABEL); #else// _LP64 // It's inflated
Label CheckSucc, LNotRecursive, LSuccess, LGoSlowPath;
bind(LNotRecursive);
movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
jccb (Assembler::notZero, CheckSucc); // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
jmpb (DONE_LABEL);
// Try to avoid passing control into the slow_path ...
bind (CheckSucc);
// The following optional optimization can be elided if necessary // Effectively: if (succ == null) goto slow path // The code reduces the window for a race, however, // and thus benefits performance.
cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), NULL_WORD);
jccb (Assembler::zero, LGoSlowPath);
xorptr(boxReg, boxReg); // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
// Memory barrier/fence // Dekker pivot point -- fulcrum : ST Owner; MEMBAR; LD Succ // Instead of MFENCE we use a dummy locked add of 0 to the top-of-stack. // This is faster on Nehalem and AMD Shanghai/Barcelona. // See https://blogs.oracle.com/dave/entry/instruction_selection_for_volatile_fences // We might also restructure (ST Owner=0;barrier;LD _Succ) to // (mov box,0; xchgq box, &m->Owner; LD _succ) .
lock(); addl(Address(rsp, 0), 0);
// Rare inopportune interleaving - race. // The successor vanished in the small window above. // The lock is contended -- (cxq|EntryList) != null -- and there's no apparent successor. // We need to ensure progress and succession. // Try to reacquire the lock. // If that fails then the new owner is responsible for succession and this // thread needs to take no further action and can exit via the fast path (success). // If the re-acquire succeeds then pass control into the slow path. // As implemented, this latter mode is horrible because we generated more // coherence traffic on the lock *and* artificially extended the critical section // length while by virtue of passing control into the slow path.
// box is really RAX -- the following CMPXCHG depends on that binding // cmpxchg R,[M] is equivalent to rax = CAS(M,rax,R)
lock();
cmpxchgptr(r15_thread, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner))); // There's no successor so we tried to regrab the lock. // If that didn't work, then another thread grabbed the // lock so we're done (and exit was a success).
jccb (Assembler::notEqual, LSuccess); // Intentional fall-through into slow path
bind (LGoSlowPath);
orl (boxReg, 1); // set ICC.ZF=0 to indicate failure
jmpb (DONE_LABEL);
bind (LSuccess);
testl (boxReg, 0); // set ICC.ZF=1 to indicate success
jmpb (DONE_LABEL);
#endif if (!UseHeavyMonitors) {
bind (Stacked);
movptr(tmpReg, Address (boxReg, 0)); // re-fetch
lock();
cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box // Intentional fall-thru into DONE_LABEL
}
bind(DONE_LABEL);
// ZFlag == 1 count in fast path // ZFlag == 0 count in slow path
jccb(Assembler::notZero, NO_COUNT);
bind(COUNT); // Count monitors in fast path #ifndef _LP64
get_thread(tmpReg);
decrementl(Address(tmpReg, JavaThread::held_monitor_count_offset())); #else// _LP64
decrementq(Address(r15_thread, JavaThread::held_monitor_count_offset())); #endif
xorl(tmpReg, tmpReg); // Set ZF == 1
bind(NO_COUNT);
}
//------------------------------------------------------------------------------------------- // Generic instructions support for use in .ad files C2 code generation
// Packed-double abs/neg, two-operand form: the result is computed in place in dst.
// opcode must be Op_AbsVD or Op_NegVD.
void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src) {
  // andpd/xorpd below operate on dst in place, so bring the input there first.
  if (dst != src) {
    movdqu(dst, src);
  }

  if (opcode != Op_AbsVD) {
    assert((opcode == Op_NegVD),"opcode should be Op_NegD");
    // Negate: XOR with the sign-flip constant.
    xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), noreg);
    return;
  }

  // Absolute value: AND with the sign-mask constant.
  andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), noreg);
}
// Packed-double abs/neg, three-operand form with an explicit vector length.
// opcode must be Op_AbsVD or Op_NegVD.
void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
  if (opcode != Op_AbsVD) {
    assert((opcode == Op_NegVD),"opcode should be Op_NegD");
    // Negate: XOR src with the sign-flip constant into dst.
    vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, noreg);
    return;
  }

  // Absolute value: AND src with the sign-mask constant into dst.
  vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, noreg);
}
// Packed-float abs/neg, two-operand form: the result is computed in place in dst.
// opcode must be Op_AbsVF or Op_NegVF.
void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src) {
  // andps/xorps below operate on dst in place, so bring the input there first.
  if (dst != src) {
    movdqu(dst, src);
  }

  if (opcode != Op_AbsVF) {
    assert((opcode == Op_NegVF),"opcode should be Op_NegF");
    // Negate: XOR with the sign-flip constant.
    xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), noreg);
    return;
  }

  // Absolute value: AND with the sign-mask constant.
  andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), noreg);
}
// Packed-float abs/neg, three-operand form with an explicit vector length.
// opcode must be Op_AbsVF or Op_NegVF.
void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len) {
  if (opcode != Op_AbsVF) {
    assert((opcode == Op_NegVF),"opcode should be Op_NegF");
    // Negate: XOR src with the sign-flip constant into dst.
    vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, noreg);
    return;
  }

  // Absolute value: AND src with the sign-mask constant into dst.
  vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, noreg);
}
void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) { switch (opcode) { case Op_RShiftVL: psrlq(dst, shift); break; // using srl to implement sra on pre-avs512 systems case Op_LShiftVL: psllq(dst, shift); break; case Op_URShiftVL: psrlq(dst, shift); break;
// Load vlen_in_bytes bytes of the iota index table for element type bt into dst.
void C2_MacroAssembler::load_iota_indices(XMMRegister dst, int vlen_in_bytes, BasicType bt) {
  // The iota indices are ordered by type B/S/I/L/F/D, and the offset between two types is 64.
  // Integral types are selected by log2 of their element size; floating point
  // types use the same log2 but sit two table entries (128 bytes) further on.
  const int bytes_per_type = 64;
  const int size_slot      = exact_log2(type2aelembytes(bt));
  const int fp_slot_skip   = is_floating_point_type(bt) ? 2 : 0;
  const int offset         = (size_slot + fp_slot_skip) * bytes_per_type;

  ExternalAddress table_entry(StubRoutines::x86::vector_iota_indices() + offset);
  load_vector(dst, table_entry, vlen_in_bytes);
}
// Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles.
// Emit one 128-bit reduction step "dst = dst <op> src" for the given
// reduction opcode and element type.
void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
  const int vlen_enc = Assembler::AVX_128bit;

  switch (opcode) {
    // Bitwise reductions do not depend on the element type.
    case Op_AndReductionV:
      pand(dst, src);
      break;
    case Op_OrReductionV:
      por (dst, src);
      break;
    case Op_XorReductionV:
      pxor(dst, src);
      break;

    case Op_MinReductionV:
      if (typ == T_BYTE) {
        pminsb(dst, src);
      } else if (typ == T_SHORT) {
        pminsw(dst, src);
      } else if (typ == T_INT) {
        pminsd(dst, src);
      } else if (typ == T_LONG) {
        assert(UseAVX > 2, "required");
        vpminsq(dst, dst, src, vlen_enc);
      } else {
        assert(false, "wrong type");
      }
      break;

    case Op_MaxReductionV:
      if (typ == T_BYTE) {
        pmaxsb(dst, src);
      } else if (typ == T_SHORT) {
        pmaxsw(dst, src);
      } else if (typ == T_INT) {
        pmaxsd(dst, src);
      } else if (typ == T_LONG) {
        assert(UseAVX > 2, "required");
        vpmaxsq(dst, dst, src, vlen_enc);
      } else {
        assert(false, "wrong type");
      }
      break;

    // Floating point add: scalar forms.
    case Op_AddReductionVF:
      addss(dst, src);
      break;
    case Op_AddReductionVD:
      addsd(dst, src);
      break;

    case Op_AddReductionVI:
      if (typ == T_BYTE) {
        paddb(dst, src);
      } else if (typ == T_SHORT) {
        paddw(dst, src);
      } else if (typ == T_INT) {
        paddd(dst, src);
      } else {
        assert(false, "wrong type");
      }
      break;
    case Op_AddReductionVL:
      paddq(dst, src);
      break;

    // Floating point multiply: scalar forms.
    case Op_MulReductionVF:
      mulss(dst, src);
      break;
    case Op_MulReductionVD:
      mulsd(dst, src);
      break;

    case Op_MulReductionVI:
      if (typ == T_SHORT) {
        pmullw(dst, src);
      } else if (typ == T_INT) {
        pmulld(dst, src);
      } else {
        assert(false, "wrong type");
      }
      break;
    case Op_MulReductionVL:
      assert(UseAVX > 2, "required");
      evpmullq(dst, dst, src, vlen_enc);
      break;

    default:
      assert(false, "wrong opcode");
  }
}
// Emits one 256-bit reduction step: dst = src1 <op> src2.
//   typ        - element type; consulted only by the min, max, integer add
//                and integer multiply reductions
//   opcode     - C2 ideal reduction opcode selecting the operation
//   dst        - destination vector register
//   src1, src2 - the two operands
// The long min/max paths assert UseAVX > 2. An opcode or element type that
// is not handled trips an assert.
void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) {
  const int vlen_enc = Assembler::AVX_256bit;

  switch (opcode) {
    // Bitwise reductions do not depend on the element type.
    case Op_AndReductionV:
      vpand(dst, src1, src2, vlen_enc);
      break;
    case Op_OrReductionV:
      vpor(dst, src1, src2, vlen_enc);
      break;
    case Op_XorReductionV:
      vpxor(dst, src1, src2, vlen_enc);
      break;

    case Op_MinReductionV:
      if (typ == T_BYTE) {
        vpminsb(dst, src1, src2, vlen_enc);
      } else if (typ == T_SHORT) {
        vpminsw(dst, src1, src2, vlen_enc);
      } else if (typ == T_INT) {
        vpminsd(dst, src1, src2, vlen_enc);
      } else if (typ == T_LONG) {
        assert(UseAVX > 2, "required");
        vpminsq(dst, src1, src2, vlen_enc);
      } else {
        assert(false, "wrong type");
      }
      break;

    case Op_MaxReductionV:
      if (typ == T_BYTE) {
        vpmaxsb(dst, src1, src2, vlen_enc);
      } else if (typ == T_SHORT) {
        vpmaxsw(dst, src1, src2, vlen_enc);
      } else if (typ == T_INT) {
        vpmaxsd(dst, src1, src2, vlen_enc);
      } else if (typ == T_LONG) {
        assert(UseAVX > 2, "required");
        vpmaxsq(dst, src1, src2, vlen_enc);
      } else {
        assert(false, "wrong type");
      }
      break;

    case Op_AddReductionVI:
      if (typ == T_BYTE) {
        vpaddb(dst, src1, src2, vlen_enc);
      } else if (typ == T_SHORT) {
        vpaddw(dst, src1, src2, vlen_enc);
      } else if (typ == T_INT) {
        vpaddd(dst, src1, src2, vlen_enc);
      } else {
        assert(false, "wrong type");
      }
      break;
    case Op_AddReductionVL:
      vpaddq(dst, src1, src2, vlen_enc);
      break;

    case Op_MulReductionVI:
      if (typ == T_SHORT) {
        vpmullw(dst, src1, src2, vlen_enc);
      } else if (typ == T_INT) {
        vpmulld(dst, src1, src2, vlen_enc);
      } else {
        assert(false, "wrong type");
      }
      break;
    case Op_MulReductionVL:
      evpmullq(dst, src1, src2, vlen_enc);
      break;

    default:
      assert(false, "wrong opcode");
  }
}
void C2_MacroAssembler::reduce_fp(int opcode, int vlen,
XMMRegister dst, XMMRegister src,
XMMRegister vtmp1, XMMRegister vtmp2) { switch (opcode) { case Op_AddReductionVF: case Op_MulReductionVF:
reduceF(opcode, vlen, dst, src, vtmp1, vtmp2); break;
case Op_AddReductionVD: case Op_MulReductionVD:
reduceD(opcode, vlen, dst, src, vtmp1, vtmp2); break;
// Emits the pextr{b,w,d,q} instruction matching element type 'bt' to move
// element 'idx' of vector register 'src' into general purpose register 'dst'.
// Only T_BYTE, T_SHORT, T_INT and T_LONG are handled; any other type trips
// an assert.
void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) {
  if (bt == T_BYTE) {
    pextrb(dst, src, idx);
  } else if (bt == T_SHORT) {
    pextrw(dst, src, idx);
  } else if (bt == T_INT) {
    pextrd(dst, src, idx);
  } else if (bt == T_LONG) {
    pextrq(dst, src, idx);
  } else {
    assert(false,"Should not reach here.");
  }
}
XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) { int esize = type2aelembytes(typ); int elem_per_lane = 16/esize; int lane = elemindex / elem_per_lane; int eindex = elemindex % elem_per_lane;
// Reload substr for rescan, this code // is executed only for large substrings (> 8 chars)
bind(RELOAD_SUBSTR); if (ae == StrIntrinsicNode::UL) {
pmovzxbw(vec, Address(str2, 0));
} else {
movdqu(vec, Address(str2, 0));
}
negptr(cnt2); // Jumped here with negative cnt2, convert to positive
bind(RELOAD_STR); // We came here after the beginning of the substring was // matched but the rest of it was not so we need to search // again. Start from the next element after the previous match.
// cnt2 is number of substring remaining elements and // cnt1 is number of string remaining elements when cmp failed. // Restored cnt1 = cnt1 - cnt2 + int_cnt2
subl(cnt1, cnt2);
addl(cnt1, int_cnt2);
movl(cnt2, int_cnt2); // Now restore cnt2
decrementl(cnt1); // Shift to next element
cmpl(cnt1, cnt2);
jcc(Assembler::negative, RET_NOT_FOUND); // Left less then substring
addptr(result, (1<<scale1));
} // (int_cnt2 > 8)
// Scan string for start of substr in 16-byte vectors
bind(SCAN_TO_SUBSTR);
pcmpestri(vec, Address(result, 0), mode);
jccb(Assembler::below, FOUND_CANDIDATE); // CF == 1
subl(cnt1, stride);
jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
cmpl(cnt1, cnt2);
jccb(Assembler::negative, RET_NOT_FOUND); // Left less then substring
addptr(result, 16);
jmpb(SCAN_TO_SUBSTR);
// Found a potential substr
bind(FOUND_CANDIDATE); // Matched whole vector if first element matched (tmp(rcx) == 0). if (int_cnt2 == stride) {
jccb(Assembler::overflow, RET_FOUND); // OF == 1
} else { // int_cnt2 > 8
jccb(Assembler::overflow, FOUND_SUBSTR);
} // After pcmpestri tmp(rcx) contains matched element index // Compute start addr of substr
lea(result, Address(result, tmp, scale1));
// Make sure string is still long enough
subl(cnt1, tmp);
cmpl(cnt1, cnt2); if (int_cnt2 == stride) {
jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
} else { // int_cnt2 > 8
jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
} // Left less then substring.
bind(RET_NOT_FOUND);
movl(result, -1);
jmp(EXIT);
if (int_cnt2 > stride) { // This code is optimized for the case when whole substring // is matched if its head is matched.
bind(MATCH_SUBSTR_HEAD);
pcmpestri(vec, Address(result, 0), mode); // Reload only string if does not match
jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0
Label CONT_SCAN_SUBSTR; // Compare the rest of substring (> 8 chars).
bind(FOUND_SUBSTR); // First 8 chars are already matched.
negptr(cnt2);
addptr(cnt2, stride);
bind(SCAN_SUBSTR);
subl(cnt1, stride);
cmpl(cnt2, -stride); // Do not read beyond substring
jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR); // Back-up strings to avoid reading beyond substring: // cnt1 = cnt1 - cnt2 + 8
addl(cnt1, cnt2); // cnt2 is negative
addl(cnt1, stride);
movl(cnt2, stride); negptr(cnt2);
bind(CONT_SCAN_SUBSTR); if (int_cnt2 < (int)G) { int tail_off1 = int_cnt2<<scale1; int tail_off2 = int_cnt2<<scale2; if (ae == StrIntrinsicNode::UL) {
pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2));
} else {
movdqu(vec, Address(str2, cnt2, scale2, tail_off2));
}
pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode);
} else { // calculate index in register to avoid integer overflow (int_cnt2*2)
movl(tmp, int_cnt2);
addptr(tmp, cnt2); if (ae == StrIntrinsicNode::UL) {
pmovzxbw(vec, Address(str2, tmp, scale2, 0));
} else {
movdqu(vec, Address(str2, tmp, scale2, 0));
}
pcmpestri(vec, Address(result, tmp, scale1, 0), mode);
} // Need to reload strings pointers if not matched whole vector
jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
addptr(cnt2, stride);
jcc(Assembler::negative, SCAN_SUBSTR); // Fall through if found full substring
} // (int_cnt2 > 8)
bind(RET_FOUND); // Found result if we matched full small substring. // Compute substr offset
subptr(result, str1); if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
shrl(result, 1); // index
}
bind(EXIT);
} // string_indexofC8
// Small strings are loaded through stack if they cross page boundary. void C2_MacroAssembler::string_indexof(Register str1, Register str2, Register cnt1, Register cnt2, int int_cnt2, Register result,
XMMRegister vec, Register tmp, int ae) {
ShortBranchVerifier sbv(this);
assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
// // int_cnt2 is length of small (< 8 chars) constant substring // or (-1) for non constant substring in which case its length // is in cnt2 register. // // Note, inline_string_indexOf() generates checks: // if (substr.count > string.count) return -1; // if (substr.count == 0) return 0; // int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0"); // This method uses the pcmpestri instruction with bound registers // inputs: // xmm - substring // rax - substring length (elements count) // mem - scanned string // rdx - string length (elements count) // 0xd - mode: 1100 (substring search) + 01 (unsigned shorts) // 0xc - mode: 1100 (substring search) + 00 (unsigned bytes) // outputs: // rcx - matched index in string
assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
{ //======================================================== // We don't know where these strings are located // and we can't read beyond them. Load them through stack.
Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR;
movptr(tmp, rsp); // save old SP
if (int_cnt2 > 0) { // small (< 8 chars) constant substring if (int_cnt2 == (1>>scale2)) { // One byte
assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding");
load_unsigned_byte(result, Address(str2, 0));
movdl(vec, result); // move 32 bits
} elseif (ae == StrIntrinsicNode::LL && int_cnt2 == 3) { // Three bytes // Not enough header space in 32-bit VM: 12+3 = 15.
movl(result, Address(str2, -1));
shrl(result, 8);
movdl(vec, result); // move 32 bits
} elseif (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) { // One char
load_unsigned_short(result, Address(str2, 0));
movdl(vec, result); // move 32 bits
} elseif (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars
movdl(vec, Address(str2, 0)); // move 32 bits
} elseif (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars
movq(vec, Address(str2, 0)); // move 64 bits
} else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 ={2, ..., 7}) // Array header size is 12 bytes in 32-bit VM // + 6 bytes for 3 chars == 18 bytes, // enough space to load vec and shift.
assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity"); if (ae == StrIntrinsicNode::UL) { int tail_off = int_cnt2-8;
pmovzxbw(vec, Address(str2, tail_off));
psrldq(vec, -2*tail_off);
} else { int tail_off = int_cnt2*(1<<scale2);
movdqu(vec, Address(str2, tail_off-16));
psrldq(vec, 16-tail_off);
}
}
} else { // not constant substring
cmpl(cnt2, stride);
jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough
// We can read beyond string if str+16 does not cross page boundary // since heaps are aligned and mapped by pages.
assert(os::vm_page_size() < (int)G, "default page should be small");
movl(result, str2); // We need only low 32 bits
andl(result, (os::vm_page_size()-1));
cmpl(result, (os::vm_page_size()-16));
jccb(Assembler::belowEqual, CHECK_STR);
// Move small strings to stack to allow load 16 bytes into vec.
subptr(rsp, 16); int stk_offset = wordSize-(1<<scale2);
push(cnt2);
if (int_cnt2 < 0) { // Only for non constant substring
jmpb(SCAN_TO_SUBSTR);
// SP saved at sp+0 // String saved at sp+1*wordSize // Substr saved at sp+2*wordSize // Substr count saved at sp+3*wordSize
// Reload substr for rescan, this code // is executed only for large substrings (> 8 chars)
bind(RELOAD_SUBSTR);
movptr(str2, Address(rsp, 2*wordSize));
movl(cnt2, Address(rsp, 3*wordSize)); if (ae == StrIntrinsicNode::UL) {
pmovzxbw(vec, Address(str2, 0));
} else {
movdqu(vec, Address(str2, 0));
} // We came here after the beginning of the substring was // matched but the rest of it was not so we need to search // again. Start from the next element after the previous match.
subptr(str1, result); // Restore counter if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
shrl(str1, 1);
}
addl(cnt1, str1);
decrementl(cnt1); // Shift to next element
cmpl(cnt1, cnt2);
jcc(Assembler::negative, RET_NOT_FOUND); // Left less then substring
addptr(result, (1<<scale1));
} // non constant
// Scan string for start of substr in 16-byte vectors
bind(SCAN_TO_SUBSTR);
assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
pcmpestri(vec, Address(result, 0), mode);
jccb(Assembler::below, FOUND_CANDIDATE); // CF == 1
subl(cnt1, stride);
jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
cmpl(cnt1, cnt2);
jccb(Assembler::negative, RET_NOT_FOUND); // Left less then substring
addptr(result, 16);
bind(ADJUST_STR);
cmpl(cnt1, stride); // Do not read beyond string
jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR); // Back-up string to avoid reading beyond string.
lea(result, Address(result, cnt1, scale1, -16));
movl(cnt1, stride);
jmpb(SCAN_TO_SUBSTR);
// Found a potential substr
bind(FOUND_CANDIDATE); // After pcmpestri tmp(rcx) contains matched element index
// Make sure string is still long enough
subl(cnt1, tmp);
cmpl(cnt1, cnt2);
jccb(Assembler::greaterEqual, FOUND_SUBSTR); // Left less then substring.
bind(FOUND_SUBSTR); // Compute start addr of substr
lea(result, Address(result, tmp, scale1)); if (int_cnt2 > 0) { // Constant substring // Repeat search for small substring (< 8 chars) // from new point without reloading substring. // Have to check that we don't read beyond string.
cmpl(tmp, stride-int_cnt2);
jccb(Assembler::greater, ADJUST_STR); // Fall through if matched whole substring.
} else { // non constant
assert(int_cnt2 == -1, "should be != 0");
addl(tmp, cnt2); // Found result if we matched whole substring.
cmpl(tmp, stride);
jcc(Assembler::lessEqual, RET_FOUND);
// Repeat search for small substring (<= 8 chars) // from new point 'str1' without reloading substring.
cmpl(cnt2, stride); // Have to check that we don't read beyond string.
jccb(Assembler::lessEqual, ADJUST_STR);
Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG; // Compare the rest of substring (> 8 chars).
movptr(str1, result);
cmpl(tmp, cnt2); // First 8 chars are already matched.
jccb(Assembler::equal, CHECK_NEXT);
bind(SCAN_SUBSTR);
pcmpestri(vec, Address(str1, 0), mode); // Need to reload strings pointers if not matched whole vector
jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
bind(CHECK_NEXT);
subl(cnt2, stride);
jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring
addptr(str1, 16); if (ae == StrIntrinsicNode::UL) {
addptr(str2, 8);
} else {
addptr(str2, 16);
}
subl(cnt1, stride);
cmpl(cnt2, stride); // Do not read beyond substring
jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR); // Back-up strings to avoid reading beyond substring.
// string_compare: emits code that compares two strings and leaves an int in 'result'.
//   str1, str2 - base addresses of the two element sequences
//   cnt1, cnt2 - their lengths on entry; both registers are overwritten
//   result     - receives the comparison result
//   vec1       - vector register argument (not referenced in the lines shown here)
//   ae         - StrIntrinsicNode encoding pair (LL, UU, LU or UL); selects scales and strides
//   mask       - opmask register read by the AVX-512 mismatch handler near the end
// The difference cnt1 - cnt2 is pushed on the stack up front. LENGTH_DIFF_LABEL pops it
// into 'result'; POP_LABEL pops and discards it. For UL the final result is negated.
// NOTE(review): as shown here the body binds COMPARE_TAIL without a visible Label
// declaration and branches to COMPARE_TAIL_LONG without binding it; check this listing
// against the complete function before relying on it.
// Compare strings, used for char[] and byte[]. void C2_MacroAssembler::string_compare(Register str1, Register str2, Register cnt1, Register cnt2, Register result,
XMMRegister vec1, int ae, KRegister mask) {
ShortBranchVerifier sbv(this);
Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;
Label COMPARE_WIDE_VECTORS_LOOP_FAILED; // used only _LP64 && AVX3 int stride, stride2, adr_stride, adr_stride1, adr_stride2; int stride2x2 = 0x40;
Address::ScaleFactor scale = Address::no_scale;
Address::ScaleFactor scale1 = Address::no_scale;
Address::ScaleFactor scale2 = Address::no_scale;
// Every encoding other than LL uses 0x20 for stride2x2.
if (ae != StrIntrinsicNode::LL) {
stride2x2 = 0x20;
}
// For the mixed encodings (LU/UL) cnt2 is halved before the lengths are compared.
if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) {
shrl(cnt2, 1);
} // Compute the minimum of the string lengths and the // difference of the string lengths (stack). // Do the conditional move stuff
movl(result, cnt1);
subl(cnt1, cnt2);
push(cnt1);
cmov32(Assembler::lessEqual, cnt2, result); // cnt2 = min(cnt1, cnt2)
if (ae == StrIntrinsicNode::UU) { // Divide length by 2 to get number of chars
shrl(cnt2, 1);
}
// A minimum length of exactly 1 goes straight to returning the length difference.
cmpl(cnt2, 1);
jcc(Assembler::equal, LENGTH_DIFF_LABEL);
// Check if the strings start at the same location and setup scale and stride if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
cmpptr(str1, str2);
jcc(Assembler::equal, LENGTH_DIFF_LABEL); if (ae == StrIntrinsicNode::LL) {
scale = Address::times_1;
stride = 16;
} else {
scale = Address::times_2;
stride = 8;
}
} else {
scale1 = Address::times_1;
scale2 = Address::times_2; // scale not used
stride = 8;
}
// Vector path setup; emitted only when AVX2 and the SSE4.2 intrinsics are enabled.
if (UseAVX >= 2 && UseSSE42Intrinsics) {
Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR;
Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR;
Label COMPARE_WIDE_VECTORS_LOOP_AVX2;
Label COMPARE_TAIL_LONG;
Label COMPARE_WIDE_VECTORS_LOOP_AVX3; // used only _LP64 && AVX3
int pcmpmask = 0x19; if (ae == StrIntrinsicNode::LL) {
pcmpmask &= ~0x01;
}
// Setup to compare 16-chars (32-bytes) vectors, // start from first character again because it has aligned address. if (ae == StrIntrinsicNode::LL) {
stride2 = 32;
} else {
stride2 = 16;
} if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
adr_stride = stride << scale;
} else {
adr_stride1 = 8; //stride << scale1;
adr_stride2 = 16; //stride << scale2;
}
assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri"); // rax and rdx are used by pcmpestri as elements counters
// Keep the full minimum length in result and round cnt2 down to a multiple of stride2.
movl(result, cnt2);
andl(cnt2, ~(stride2-1)); // cnt2 holds the vector count
jcc(Assembler::zero, COMPARE_TAIL_LONG);
// Compare the characters at index in cnt1
bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character
// Presumably loads the elements at index cnt1 of both strings into result and cnt2
// (load_next_elements is defined elsewhere); their difference becomes the result.
load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
subl(result, cnt2);
jmp(POP_LABEL);
bind(COMPARE_TAIL); // limit is zero
movl(cnt2, result); // Fallthru to tail compare
} // Shift str2 and str1 to the end of the arrays, negate min if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
lea(str1, Address(str1, cnt2, scale));
lea(str2, Address(str2, cnt2, scale));
} else {
lea(str1, Address(str1, cnt2, scale1));
lea(str2, Address(str2, cnt2, scale2));
}
decrementl(cnt2); // first character was compared already
negptr(cnt2);
// Compare the rest of the elements
// cnt2 is now a negative index that is incremented towards zero; the loop exits
// to POP_LABEL on the first non-zero element difference.
bind(WHILE_HEAD_LABEL);
load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae);
subl(result, cnt1);
jccb(Assembler::notZero, POP_LABEL);
increment(cnt2);
jccb(Assembler::notZero, WHILE_HEAD_LABEL);
// Strings are equal up to min length. Return the length difference.
bind(LENGTH_DIFF_LABEL);
pop(result); if (ae == StrIntrinsicNode::UU) { // Divide diff by 2 to get number of chars
sarl(result, 1);
}
jmpb(DONE_LABEL);
// AVX-512 mismatch handler: inverts the opmask, bit-scans for its lowest set bit
// (the lowest clear bit of 'mask'), adds that position to result and reloads the
// two elements found there to compute their difference.
#ifdef _LP64 if (VM_Version::supports_avx512vlbw()) {
bind(COMPARE_WIDE_VECTORS_LOOP_FAILED);
kmovql(cnt1, mask);
notq(cnt1);
bsfq(cnt2, cnt1); if (ae != StrIntrinsicNode::LL) { // Divide diff by 2 to get number of chars
sarl(cnt2, 1);
}
addq(result, cnt2); if (ae == StrIntrinsicNode::LL) {
load_unsigned_byte(cnt1, Address(str2, result));
load_unsigned_byte(result, Address(str1, result));
// NOTE(review): 'elseif' is not a C++ keyword; presumably 'else if' is intended - confirm.
} elseif (ae == StrIntrinsicNode::UU) {
load_unsigned_short(cnt1, Address(str2, result, scale));
load_unsigned_short(result, Address(str1, result, scale));
} else {
// Mixed encodings: str2 is read as a 16-bit element, str1 as a byte.
load_unsigned_short(cnt1, Address(str2, result, scale2));
load_unsigned_byte(result, Address(str1, result, scale1));
}
subl(result, cnt1);
jmpb(POP_LABEL);
}//if (VM_Version::supports_avx512vlbw()) #endif// _LP64
// Discard the stored length difference
bind(POP_LABEL);
pop(cnt1);
// That's it
// For the UL encoding the sign of the result is flipped before returning.
bind(DONE_LABEL); if(ae == StrIntrinsicNode::UL) {
negl(result);
}
}
// Search for Non-ASCII character (Negative byte value) in a byte array, // return the index of the first such character, otherwise the length // of the array segment searched. // ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java // @IntrinsicCandidate // public static int countPositives(byte[] ba, int off, int len) { // for (int i = off; i < off + len; i++) { // if (ba[i] < 0) { // return i - off; // } // } // return len; // } void C2_MacroAssembler::count_positives(Register ary1, Register len, Register result, Register tmp1,
XMMRegister vec1, XMMRegister vec2, KRegister mask1, KRegister mask2) { // rsi: byte array // rcx: len // rax: result
ShortBranchVerifier sbv(this);
assert_different_registers(ary1, len, result, tmp1);
assert_different_registers(vec1, vec2);
Label ADJUST, TAIL_ADJUST, DONE, TAIL_START, CHAR_ADJUST, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE;
bind(test_tail); // bail out when there is nothing to be done
testl(tmp1, -1);
jcc(Assembler::zero, DONE);
// ~(~0 << len) applied up to two times (for 32-bit scenario) #ifdef _LP64
mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF);
shlxq(tmp3_aliased, tmp3_aliased, tmp1);
notq(tmp3_aliased);
kmovql(mask2, tmp3_aliased); #else
Label k_init;
jmp(k_init);
// We could not read 64-bits from a general purpose register thus we move // data required to compose 64 1's to the instruction stream // We emit 64 byte wide series of elements from 0..63 which later on would // be used as a compare targets with tail count contained in tmp1 register. // Result would be a k register having tmp1 consecutive number or 1 // counting from least significant bit.
address tmp = pc();
emit_int64(0x0706050403020100);
emit_int64(0x0F0E0D0C0B0A0908);
emit_int64(0x1716151413121110);
emit_int64(0x1F1E1D1C1B1A1918);
emit_int64(0x2726252423222120);
emit_int64(0x2F2E2D2C2B2A2928);
emit_int64(0x3736353433323130);
emit_int64(0x3F3E3D3C3B3A3938);
bind(k_init);
lea(len, InternalAddress(tmp)); // create mask to test for negative byte inside a vector
evpbroadcastb(vec1, tmp1, Assembler::AVX_512bit);
evpcmpgtb(mask2, vec1, Address(len, 0), Assembler::AVX_512bit);
bind(BREAK_LOOP); // At least one byte in the last 64 bytes is negative. // Set up to look at the last 64 bytes as if they were a tail
lea(ary1, Address(ary1, len, Address::times_1));
addptr(result, len); // Ignore the very last byte: if all others are positive, // it must be negative, so we can skip right to the 2+1 byte // end comparison at this point
orl(result, 63);
movl(len, 63); // Fallthru to tail compare
} else {
if (UseAVX >= 2 && UseSSE >= 2) { // With AVX2, use 32-byte vector compare
Label COMPARE_WIDE_VECTORS, BREAK_LOOP;
testl(result, 0x0000001f); // any bytes remaining?
jcc(Assembler::zero, DONE);
// Quick test using the already prepared vector mask
movl(len, result);
andl(len, 0x0000001f);
vmovdqu(vec1, Address(ary1, len, Address::times_1, -32));
vptest(vec1, vec2);
jcc(Assembler::zero, DONE); // There are zeros, jump to the tail to determine exactly where
jmpb(TAIL_START);
bind(BREAK_LOOP); // At least one byte in the last 32-byte vector is negative. // Set up to look at the last 32 bytes as if they were a tail
lea(ary1, Address(ary1, len, Address::times_1));
addptr(result, len); // Ignore the very last byte: if all others are positive, // it must be negative, so we can skip right to the 2+1 byte // end comparison at this point
orl(result, 31);
movl(len, 31); // Fallthru to tail compare
} elseif (UseSSE42Intrinsics) { // With SSE4.2, use double quad vector compare
Label COMPARE_WIDE_VECTORS, BREAK_LOOP;
testl(result, 0x0000000f); // len is zero, any bytes remaining?
jcc(Assembler::zero, DONE);
// Quick test using the already prepared vector mask
movl(len, result);
andl(len, 0x0000000f); // tail count (in bytes)
movdqu(vec1, Address(ary1, len, Address::times_1, -16));
ptest(vec1, vec2);
jcc(Assembler::zero, DONE);
jmpb(TAIL_START);
bind(BREAK_LOOP); // At least one byte in the last 16-byte vector is negative. // Set up and look at the last 16 bytes as if they were a tail
lea(ary1, Address(ary1, len, Address::times_1));
addptr(result, len); // Ignore the very last byte: if all others are positive, // it must be negative, so we can skip right to the 2+1 byte // end comparison at this point
orl(result, 15);
movl(len, 15); // Fallthru to tail compare
}
}
bind(TAIL_ADJUST); // there are negative bits in the last 4 byte block. // Adjust result and check the next three bytes
addptr(result, len);
orl(result, 3);
lea(ary1, Address(ary1, len, Address::times_1));
jmpb(COMPARE_CHAR);
bind(CHAR_ADJUST); // We are looking at a char + optional byte tail, and found that one // of the bytes in the char is negative. Adjust the result, check the // first byte and readjust if needed.
andl(result, 0xfffffffc);
testl(tmp1, 0x00000080); // little-endian, so lowest byte comes first
jccb(Assembler::notZero, DONE);
addptr(result, 1);
// That's it
bind(DONE); if (UseAVX >= 2 && UseSSE >= 2) { // clean upper bits of YMM registers
vpxor(vec1, vec1);
vpxor(vec2, vec2);
}
}
bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit);
evpcmpeqb(mask, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit);
kortestql(mask, mask);
jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare
addptr(limit, 64); // update since we already compared at this addr
cmpl(limit, -64);
jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3);
// At this point we may still need to compare -limit+result bytes. // We could execute the next two instruction and just continue via non-wide path: // cmpl(limit, 0); // jcc(Assembler::equal, COMPARE_TAIL); // true // But since we stopped at the points ary{1,2}+limit which are // not farther than 64 bytes from the ends of arrays ary{1,2}+result // (|limit| <= 32 and result < 32), // we may just compare the last 64 bytes. //
addptr(result, -64); // it is safe, bc we just came from this area
evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit);
evpcmpeqb(mask, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit);
kortestql(mask, mask);
jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare
// That's it
bind(DONE); if (UseAVX >= 2) { // clean upper bits of YMM registers
vpxor(vec1, vec1);
vpxor(vec2, vec2);
}
}
// Emits the opmask-predicated form of a vector shift or rotate whose count is
// the immediate 'imm8'.
//   ideal_opc - C2 ideal opcode selecting the operation
//   eType     - element type; forwarded to the rotate helpers only
//   mask      - opmask register handed to the emitter
//   dst, src1 - destination and source vector registers
//   imm8      - immediate shift/rotate count
//   merge     - forwarded unchanged to the emitter (presumably selects merge-
//               versus zero-masking - confirm against the Assembler)
//   vlen_enc  - vector length encoding
// Shifts are emitted through the explicitly qualified Assembler:: methods;
// rotates go through evrord/evrold. Any other opcode is a fatal error.
void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
                                    XMMRegister src1, int imm8, bool merge, int vlen_enc) {
  switch (ideal_opc) {
    // Left shifts
    case Op_LShiftVS:  Assembler::evpsllw(dst, mask, src1, imm8, merge, vlen_enc); return;
    case Op_LShiftVI:  Assembler::evpslld(dst, mask, src1, imm8, merge, vlen_enc); return;
    case Op_LShiftVL:  Assembler::evpsllq(dst, mask, src1, imm8, merge, vlen_enc); return;

    // Arithmetic right shifts
    case Op_RShiftVS:  Assembler::evpsraw(dst, mask, src1, imm8, merge, vlen_enc); return;
    case Op_RShiftVI:  Assembler::evpsrad(dst, mask, src1, imm8, merge, vlen_enc); return;
    case Op_RShiftVL:  Assembler::evpsraq(dst, mask, src1, imm8, merge, vlen_enc); return;

    // Logical right shifts
    case Op_URShiftVS: Assembler::evpsrlw(dst, mask, src1, imm8, merge, vlen_enc); return;
    case Op_URShiftVI: Assembler::evpsrld(dst, mask, src1, imm8, merge, vlen_enc); return;
    case Op_URShiftVL: Assembler::evpsrlq(dst, mask, src1, imm8, merge, vlen_enc); return;

    // Rotates
    case Op_RotateRightV: evrord(eType, dst, mask, src1, imm8, merge, vlen_enc); return;
    case Op_RotateLeftV:  evrold(eType, dst, mask, src1, imm8, merge, vlen_enc); return;

    default:
      fatal("Unsupported masked operation");
      return;
  }
}
void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
XMMRegister src1, XMMRegister src2, bool merge, int vlen_enc, bool is_varshift) { switch (ideal_opc) { case Op_AddVB:
evpaddb(dst, mask, src1, src2, merge, vlen_enc); break; case Op_AddVS:
evpaddw(dst, mask, src1, src2, merge, vlen_enc); break; case Op_AddVI:
evpaddd(dst, mask, src1, src2, merge, vlen_enc); break; case Op_AddVL:
evpaddq(dst, mask, src1, src2, merge, vlen_enc); break; case Op_AddVF:
evaddps(dst, mask, src1, src2, merge, vlen_enc); break; case Op_AddVD:
evaddpd(dst, mask, src1, src2, merge, vlen_enc); break; case Op_SubVB:
evpsubb(dst, mask, src1, src2, merge, vlen_enc); break; case Op_SubVS:
evpsubw(dst, mask, src1, src2, merge, vlen_enc); break; case Op_SubVI:
evpsubd(dst, mask, src1, src2, merge, vlen_enc); break; case Op_SubVL:
evpsubq(dst, mask, src1, src2, merge, vlen_enc); break; case Op_SubVF:
evsubps(dst, mask, src1, src2, merge, vlen_enc); break; case Op_SubVD:
evsubpd(dst, mask, src1, src2, merge, vlen_enc); break; case Op_MulVS:
evpmullw(dst, mask, src1, src2, merge, vlen_enc); break; case Op_MulVI:
evpmulld(dst, mask, src1, src2, merge, vlen_enc); break; case Op_MulVL:
evpmullq(dst, mask, src1, src2, merge, vlen_enc); break; case Op_MulVF:
evmulps(dst, mask, src1, src2, merge, vlen_enc); break; case Op_MulVD:
evmulpd(dst, mask, src1, src2, merge, vlen_enc); break; case Op_DivVF:
evdivps(dst, mask, src1, src2, merge, vlen_enc); break; case Op_DivVD:
evdivpd(dst, mask, src1, src2, merge, vlen_enc); break; case Op_SqrtVF:
evsqrtps(dst, mask, src1, src2, merge, vlen_enc); break; case Op_SqrtVD:
evsqrtpd(dst, mask, src1, src2, merge, vlen_enc); break; case Op_AbsVB:
evpabsb(dst, mask, src2, merge, vlen_enc); break; case Op_AbsVS:
evpabsw(dst, mask, src2, merge, vlen_enc); break; case Op_AbsVI:
evpabsd(dst, mask, src2, merge, vlen_enc); break; case Op_AbsVL:
evpabsq(dst, mask, src2, merge, vlen_enc); break; case Op_FmaVF:
evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break; case Op_FmaVD:
evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break; case Op_VectorRearrange:
evperm(eType, dst, mask, src2, src1, merge, vlen_enc); break; case Op_LShiftVS:
evpsllw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; case Op_LShiftVI:
evpslld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; case Op_LShiftVL:
evpsllq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; case Op_RShiftVS:
evpsraw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; case Op_RShiftVI:
evpsrad(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; case Op_RShiftVL:
evpsraq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; case Op_URShiftVS:
evpsrlw(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; case Op_URShiftVI:
evpsrld(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; case Op_URShiftVL:
evpsrlq(dst, mask, src1, src2, merge, vlen_enc, is_varshift); break; case Op_RotateLeftV:
evrold(eType, dst, mask, src1, src2, merge, vlen_enc); break; case Op_RotateRightV:
evrord(eType, dst, mask, src1, src2, merge, vlen_enc); break; case Op_MaxV:
evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break; case Op_MinV:
evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break; case Op_XorV:
evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break; case Op_OrV:
evor(eType, dst, mask, src1, src2, merge, vlen_enc); break; case Op_AndV:
evand(eType, dst, mask, src1, src2, merge, vlen_enc); break; default:
fatal("Unsupported masked operation"); break;
}
}
void C2_MacroAssembler::evmasked_op(int ideal_opc, BasicType eType, KRegister mask, XMMRegister dst,
XMMRegister src1, Address src2, bool merge, int vlen_enc) { switch (ideal_opc) { case Op_AddVB:
evpaddb(dst, mask, src1, src2, merge, vlen_enc); break; case Op_AddVS:
evpaddw(dst, mask, src1, src2, merge, vlen_enc); break; case Op_AddVI:
evpaddd(dst, mask, src1, src2, merge, vlen_enc); break; case Op_AddVL:
evpaddq(dst, mask, src1, src2, merge, vlen_enc); break; case Op_AddVF:
evaddps(dst, mask, src1, src2, merge, vlen_enc); break; case Op_AddVD:
evaddpd(dst, mask, src1, src2, merge, vlen_enc); break; case Op_SubVB:
evpsubb(dst, mask, src1, src2, merge, vlen_enc); break; case Op_SubVS:
evpsubw(dst, mask, src1, src2, merge, vlen_enc); break; case Op_SubVI:
evpsubd(dst, mask, src1, src2, merge, vlen_enc); break; case Op_SubVL:
evpsubq(dst, mask, src1, src2, merge, vlen_enc); break; case Op_SubVF:
evsubps(dst, mask, src1, src2, merge, vlen_enc); break; case Op_SubVD:
evsubpd(dst, mask, src1, src2, merge, vlen_enc); break; case Op_MulVS:
evpmullw(dst, mask, src1, src2, merge, vlen_enc); break; case Op_MulVI:
evpmulld(dst, mask, src1, src2, merge, vlen_enc); break; case Op_MulVL:
evpmullq(dst, mask, src1, src2, merge, vlen_enc); break; case Op_MulVF:
evmulps(dst, mask, src1, src2, merge, vlen_enc); break; case Op_MulVD:
evmulpd(dst, mask, src1, src2, merge, vlen_enc); break; case Op_DivVF:
evdivps(dst, mask, src1, src2, merge, vlen_enc); break; case Op_DivVD:
evdivpd(dst, mask, src1, src2, merge, vlen_enc); break; case Op_FmaVF:
evpfma213ps(dst, mask, src1, src2, merge, vlen_enc); break; case Op_FmaVD:
evpfma213pd(dst, mask, src1, src2, merge, vlen_enc); break; case Op_MaxV:
evpmaxs(eType, dst, mask, src1, src2, merge, vlen_enc); break; case Op_MinV:
evpmins(eType, dst, mask, src1, src2, merge, vlen_enc); break; case Op_XorV:
evxor(eType, dst, mask, src1, src2, merge, vlen_enc); break; case Op_OrV:
evor(eType, dst, mask, src1, src2, merge, vlen_enc); break; case Op_AndV:
evand(eType, dst, mask, src1, src2, merge, vlen_enc); break; default:
fatal("Unsupported masked operation"); break;
}
}
// Emits the opmask logical operation selected by ideal_opc (Op_AndVMask,
// Op_OrVMask or Op_XorVMask) on src1 and src2 with the result in dst.
// The element type handed to kand/kor/kxor is chosen from mask_len:
//   2, 4, 8 -> T_BYTE, 16 -> T_SHORT, 32 -> T_INT, 64 -> T_LONG.
// Any other mask_len or ideal_opc ends in fatal().
void C2_MacroAssembler::masked_op(int ideal_opc, int mask_len, KRegister dst,
                                  KRegister src1, KRegister src2) {
  BasicType etype = T_ILLEGAL;
  if (mask_len == 2 || mask_len == 4 || mask_len == 8) {
    etype = T_BYTE;
  } else if (mask_len == 16) {
    etype = T_SHORT;
  } else if (mask_len == 32) {
    etype = T_INT;
  } else if (mask_len == 64) {
    etype = T_LONG;
  } else {
    fatal("Unsupported type");
  }
  assert(etype != T_ILLEGAL, "");

  if (ideal_opc == Op_AndVMask) {
    kand(etype, dst, src1, src2);
  } else if (ideal_opc == Op_OrVMask) {
    kor(etype, dst, src1, src2);
  } else if (ideal_opc == Op_XorVMask) {
    kxor(etype, dst, src1, src2);
  } else {
    fatal("Unsupported masked operation");
  }
}
/*
 * Following routine handles special floating point values(NaN/Inf/-Inf/Max/Min) for casting operation.
 * If src is NaN, the result is 0.
 * If the src is negative infinity or any value less than or equal to the value of Integer.MIN_VALUE,
 * the result is equal to the value of Integer.MIN_VALUE.
 * If the src is positive infinity or any value greater than or equal to the value of Integer.MAX_VALUE,
 * the result is equal to the value of Integer.MAX_VALUE.
 *
 * NOTE(review): the body below is incomplete as it stands. Label 'done' is the
 * target of the jccb but is never bound, xtmp3 is read by the first vpxor after
 * the branch before anything in this body writes it, and there is no closing
 * brace before the next definition. Statements appear to have been lost; restore
 * them from the upstream source before building.
 */
void C2_MacroAssembler::vector_cast_float_to_int_special_cases_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                                   XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4, Register rscratch, AddressLiteral float_sign_flip, int vec_enc) {
  assert(rscratch != noreg || always_reachable(float_sign_flip), "missing");
  Label done;
  // Load the float_sign_flip constant and compare it doubleword-wise with dst;
  // xtmp2 becomes the mask of lanes whose result equals that constant.
  vmovdqu(xtmp1, float_sign_flip, vec_enc, rscratch);
  vpcmpeqd(xtmp2, dst, xtmp1, vec_enc);
  // Skip the fix-up entirely when the mask in xtmp2 is all zeros.
  vptest(xtmp2, xtmp2, vec_enc);
  jccb(Assembler::equal, done);
  // Recompute the mask for remaining special value.
  vpxor(xtmp2, xtmp2, xtmp3, vec_enc);
  // Extract SRC values corresponding to TRUE mask lanes.
  vpand(xtmp4, xtmp2, src, vec_enc);
  // Flip mask bits so that MSB bit of MASK lanes corresponding to +ve special
  // values are set.
  vpxor(xtmp3, xtmp2, xtmp4, vec_enc);
/*
 * Following routine handles special floating point values(NaN/Inf/-Inf/Max/Min) for casting operation.
 * If src is NaN, the result is 0.
 * If the src is negative infinity or any value less than or equal to the value of Long.MIN_VALUE,
 * the result is equal to the value of Long.MIN_VALUE.
 * If the src is positive infinity or any value greater than or equal to the value of Long.MAX_VALUE,
 * the result is equal to the value of Long.MAX_VALUE.
 *
 * NOTE(review): the signature and the body do not agree. The body reads
 * float_sign_flip, xtmp3, xtmp4, xtmp5 and src_vec_enc, none of which are
 * parameters of this function, and it never touches ktmp1, ktmp2 or vec_enc
 * (double_sign_flip appears only in the assert). The body also works on
 * doublewords (vpcmpeqd, "0x80000000", "max int"), which does not match the
 * Long contract described above. Presumably this body belongs to a different
 * double-to-int AVX routine -- confirm against the upstream source.
 */
void C2_MacroAssembler::vector_cast_double_to_long_special_cases_evex(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                                      XMMRegister xtmp2, KRegister ktmp1, KRegister ktmp2, Register rscratch, AddressLiteral double_sign_flip, int vec_enc) {
  assert(rscratch != noreg || always_reachable(double_sign_flip), "missing");
  Label done;
  // Compare the destination lanes with float_sign_flip
  // value to get mask for all special values.
  movdqu(xtmp1, float_sign_flip, rscratch);
  vpcmpeqd(xtmp2, dst, xtmp1, Assembler::AVX_128bit);
  // Nothing to fix up when the mask in xtmp2 is all zeros.
  ptest(xtmp2, xtmp2);
  jccb(Assembler::equal, done);
  // Flip float_sign_flip to get max integer value.
  vpcmpeqd(xtmp4, xtmp4, xtmp4, Assembler::AVX_128bit);
  pxor(xtmp1, xtmp4);
  // Set destination lanes corresponding to unordered source lanes as zero.
  vpxor(xtmp4, xtmp4, xtmp4, src_vec_enc);
  vcmppd(xtmp3, src, src, Assembler::UNORD_Q, src_vec_enc);
  // Shuffle mask vector and pack lower doubles word from each quadword lane.
  vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
  vblendvps(dst, dst, xtmp4, xtmp3, Assembler::AVX_128bit);
  // Recompute the mask for remaining special value.
  pxor(xtmp2, xtmp3);
  // Extract mask corresponding to non-negative source lanes.
  vcmppd(xtmp3, src, xtmp4, Assembler::NLT_UQ, src_vec_enc);
  // Shuffle mask vector and pack lower doubles word from each quadword lane.
  vector_crosslane_doubleword_pack_avx(xtmp3, xtmp3, xtmp4, xtmp5, 0x88, src_vec_enc);
  pand(xtmp3, xtmp2);
  // Replace destination lanes holding special value(0x80000000) with max int
  // if corresponding source lane holds a +ve value.
  vblendvps(dst, dst, xtmp1, xtmp3, Assembler::AVX_128bit);
  bind(done);
}
/*
 * Algorithm for vector D2L and F2I conversions:-
 * a) Perform vector D2L/F2I cast.
 * b) Choose the fast path if none of the result vector lanes contains the 0x80000000 value.
 *    That value signifies that the source could be any of the special floating point
 *    values (NaN, -Inf, Inf, Max, -Min).
 * c) Set the destination to zero if the source is a NaN value.
 * d) Replace 0x80000000 with MaxInt if the source lane contains a +ve value.
 */
// Broadcasts the immediate imm32 to every lane of dst; the lane width is the
// element size of bt (type2aelembytes). imm32 is first materialized in rtmp,
// so rtmp is overwritten.
//
// Two emission strategies:
//  - EVEX broadcast directly from the general purpose register, used when
//    AVX512VL (int/long lanes) or AVX512VL+BW (subword lanes) is available
//    and, for 8 byte lanes, only on LP64.
//  - Otherwise the value is moved into dst (movq on LP64, movdl on 32 bit)
//    and broadcast from the vector register itself.
//
// A lane size other than 1, 2, 4 or 8 bytes is reported through fatal().
// The 'default:' labels are required for that: without them the fatal() call
// follows a 'break' and can never execute, so an unsupported lane size would
// silently emit no broadcast at all.
void C2_MacroAssembler::vbroadcast(BasicType bt, XMMRegister dst, int imm32, Register rtmp, int vec_enc) {
  int lane_size = type2aelembytes(bt);
  bool is_LP64 = LP64_ONLY(true) NOT_LP64(false);
  if ((is_LP64 || lane_size < 8) &&
      ((is_non_subword_integral_type(bt) && VM_Version::supports_avx512vl()) ||
       (is_subword_type(bt) && VM_Version::supports_avx512vlbw()))) {
    movptr(rtmp, imm32);
    switch(lane_size) {
      case 1 : evpbroadcastb(dst, rtmp, vec_enc); break;
      case 2 : evpbroadcastw(dst, rtmp, vec_enc); break;
      case 4 : evpbroadcastd(dst, rtmp, vec_enc); break;
      case 8 : evpbroadcastq(dst, rtmp, vec_enc); break;
      default:
        fatal("Unsupported lane size %d", lane_size);
        break;
    }
  } else {
    movptr(rtmp, imm32);
    LP64_ONLY(movq(dst, rtmp)) NOT_LP64(movdl(dst, rtmp));
    switch(lane_size) {
      case 1 : vpbroadcastb(dst, dst, vec_enc); break;
      case 2 : vpbroadcastw(dst, dst, vec_enc); break;
      case 4 : vpbroadcastd(dst, dst, vec_enc); break;
      case 8 : vpbroadcastq(dst, dst, vec_enc); break;
      default:
        fatal("Unsupported lane size %d", lane_size);
        break;
    }
  }
}
//
// Following is lookup table based popcount computation algorithm:-
//       Index   Bit set count
//     [ 0000 ->   0,
//       0001 ->   1,
//       0010 ->   1,
//       0011 ->   2,
//       0100 ->   1,
//       0101 ->   2,
//       0110 ->   2,
//       0111 ->   3,
//       1000 ->   1,
//       1001 ->   2,
//       1010 ->   2,
//       1011 ->   3,
//       1100 ->   2,
//       1101 ->   3,
//       1110 ->   3,
//       1111 ->   4 ]
//  a. Count the number of 1s in 4 LSB bits of each byte. These bits are used as
//     shuffle indices for lookup table access.
//  b. Right shift each byte of vector lane by 4 positions.
//  c. Count the number of 1s in 4 MSB bits of each byte. These bits are used as
//     shuffle indices for lookup table access.
//  d. Add the bitset count of upper and lower 4 bits of each byte.
//  e. Unpack double words to quad words and compute sum of absolute difference of bitset
//     count of all the bytes of a quadword.
//  f. Perform step e. for upper 128bit vector lane.
//  g. Pack the bitset count of quadwords back to double word.
//  h. Unpacking and packing operations are not needed for 64bit vector lane.
// Bit reversal algorithm first reverses the bits of each byte followed by // a byte level reversal for multi-byte primitive types (short/int/long). // Algorithm performs a lookup table access to get reverse bit sequence // corresponding to a 4 bit value. Thus a reverse bit sequence for a byte // is obtained by swapping the reverse bit sequences of upper and lower // nibble of a byte. void C2_MacroAssembler::vector_reverse_bit(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
XMMRegister xtmp2, Register rtmp, int vec_enc) { if (VM_Version::supports_avx512vlbw()) {
// Get the reverse bit sequence of lower nibble of each byte.
vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_reverse_bit_lut()), vec_enc, noreg);
vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
evpandq(dst, xtmp2, src, vec_enc);
vpshufb(dst, xtmp1, dst, vec_enc);
vpsllq(dst, dst, 4, vec_enc);
// Get the reverse bit sequence of upper nibble of each byte.
vpandn(xtmp2, xtmp2, src, vec_enc);
vpsrlq(xtmp2, xtmp2, 4, vec_enc);
vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
// Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte.
evporq(xtmp2, dst, xtmp2, vec_enc);
vector_reverse_byte(bt, dst, xtmp2, vec_enc);
} elseif(vec_enc == Assembler::AVX_512bit) { // Shift based bit reversal.
assert(bt == T_LONG || bt == T_INT, "");
// Swap lower and upper nibble of each byte.
vector_swap_nbits(4, 0x0F0F0F0F, xtmp1, src, xtmp2, rtmp, vec_enc);
// Swap two least and most significant bits of each nibble.
vector_swap_nbits(2, 0x33333333, dst, xtmp1, xtmp2, rtmp, vec_enc);
// Get the reverse bit sequence of lower nibble of each byte.
vpand(dst, xtmp2, src, vec_enc);
vpshufb(dst, xtmp1, dst, vec_enc);
vpsllq(dst, dst, 4, vec_enc);
// Get the reverse bit sequence of upper nibble of each byte.
vpandn(xtmp2, xtmp2, src, vec_enc);
vpsrlq(xtmp2, xtmp2, 4, vec_enc);
vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
// Perform logical OR operation b/w left shifted reverse bit sequence of lower nibble and // right shifted reverse bit sequence of upper nibble to obtain the reverse bit sequence of each byte.
vpor(xtmp2, dst, xtmp2, vec_enc);
vector_reverse_byte(bt, dst, xtmp2, vec_enc);
}
}
// Galois field instruction based bit reversal based on following algorithm. // http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
vpbroadcastq(xtmp, mask, vec_enc, rscratch);
vgf2p8affineqb(xtmp, src, xtmp, 0, vec_enc);
vector_reverse_byte(bt, dst, xtmp, vec_enc);
}
// Reverses the byte order within every element of type bt using rotates and
// vector_swap_nbits instead of a shuffle table. Requires EVEX support.
// xtmp1, xtmp2 and rtmp are scratch; T_BYTE elements are simply copied.
void C2_MacroAssembler::vector_reverse_byte64(BasicType bt, XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                              XMMRegister xtmp2, Register rtmp, int vec_enc) {
  assert(VM_Version::supports_evex(), "");

  if (bt == T_LONG) {
    // Rotate by 32 to exchange the two doublewords of each quadword, then by 16
    // to exchange the two words of each doubleword, and finish by exchanging
    // the two bytes of each word.
    evprorq(xtmp1, k0, src, 32, true, vec_enc);
    evprord(xtmp1, k0, xtmp1, 16, true, vec_enc);
    vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
  } else if (bt == T_INT) {
    // Exchange the two words of each doubleword, then the bytes of each word.
    evprord(xtmp1, k0, src, 16, true, vec_enc);
    vector_swap_nbits(8, 0x00FF00FF, dst, xtmp1, xtmp2, rtmp, vec_enc);
  } else if (bt == T_CHAR || bt == T_SHORT) {
    // Only the two bytes of each word need to be exchanged.
    vector_swap_nbits(8, 0x00FF00FF, dst, src, xtmp2, rtmp, vec_enc);
  } else if (bt == T_BYTE) {
    // Single byte elements: the result is a plain copy of src.
    evmovdquq(dst, k0, src, true, vec_enc);
  } else {
    fatal("Unsupported type %s", type2name(bt));
  }
}
// Reverses the byte order within every element of type bt.
// T_BYTE needs no reordering and is emitted as a register copy. For the wider
// types a pre-computed permutation mask is loaded into dst and vpshufb then
// shuffles the bytes of src with it.
void C2_MacroAssembler::vector_reverse_byte(BasicType bt, XMMRegister dst, XMMRegister src, int vec_enc) {
  if (bt == T_BYTE) {
    const bool evex_copy = VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit;
    if (!evex_copy) {
      vmovdqu(dst, src);
    } else {
      evmovdquq(dst, k0, src, true, vec_enc);
    }
    return;
  }

  // Pick the shuffle indices that match the element width.
  if (bt == T_LONG) {
    vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_long()), vec_enc, noreg);
  } else if (bt == T_INT) {
    vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_int()), vec_enc, noreg);
  } else if (bt == T_CHAR || bt == T_SHORT) {
    vmovdqu(dst, ExternalAddress(StubRoutines::x86::vector_reverse_byte_perm_mask_short()), vec_enc, noreg);
  } else {
    fatal("Unsupported type %s", type2name(bt));
  }
  vpshufb(dst, src, dst, vec_enc);
}
// Emits a per-element leading zero count of src into dst for integral
// element types, using the AVX512CD vplzcnt instructions where they apply.
// ktmp is passed as the mask operand together with 'merge'; in the T_BYTE
// path ktmp is overwritten. xtmp1..xtmp3 and rtmp are scratch.
void C2_MacroAssembler::vector_count_leading_zeros_evex(BasicType bt, XMMRegister dst, XMMRegister src,
                                                        XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3,
                                                        KRegister ktmp, Register rtmp, bool merge, int vec_enc) {
  assert(is_integral_type(bt), "");
  assert(VM_Version::supports_avx512vl() || vec_enc == Assembler::AVX_512bit, "");
  assert(VM_Version::supports_avx512cd(), "");

  if (bt == T_LONG) {
    evplzcntq(dst, ktmp, src, merge, vec_enc);
    return;
  }

  if (bt == T_INT) {
    evplzcntd(dst, ktmp, src, merge, vec_enc);
    return;
  }

  if (bt == T_SHORT) {
    // Interleave every source word with a word taken from xtmp1 (set up by the
    // vpternlogd with immediate 0xff), count leading zeros on the resulting
    // doublewords for the low and the high halves separately, then pack the
    // two doubleword results back into words.
    vpternlogd(xtmp1, 0xff, xtmp1, xtmp1, vec_enc);
    vpunpcklwd(xtmp2, xtmp1, src, vec_enc);
    evplzcntd(xtmp2, ktmp, xtmp2, merge, vec_enc);
    vpunpckhwd(dst, xtmp1, src, vec_enc);
    evplzcntd(dst, ktmp, dst, merge, vec_enc);
    vpackusdw(dst, xtmp2, dst, vec_enc);
    return;
  }

  if (bt == T_BYTE) {
    // T1 = Compute leading zero counts of 4 LSB bits of each byte by
    // accessing the lookup table.
    // T2 = Compute leading zero counts of 4 MSB bits of each byte by
    // accessing the lookup table.
    // Add T1 to T2 if 4 MSB bits of byte are all zeros.
    assert(VM_Version::supports_avx512bw(), "");
    evmovdquq(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), vec_enc, rtmp);
    vbroadcast(T_INT, dst, 0x0F0F0F0F, rtmp, vec_enc);
    // T1 -> xtmp2
    vpand(xtmp2, dst, src, vec_enc);
    vpshufb(xtmp2, xtmp1, xtmp2, vec_enc);
    // Upper nibbles -> xtmp3, T2 -> dst
    vpsrlw(xtmp3, src, 4, vec_enc);
    vpand(xtmp3, dst, xtmp3, vec_enc);
    vpshufb(dst, xtmp1, xtmp3, vec_enc);
    // ktmp selects the bytes whose upper nibble is zero; only those get T1 added.
    vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
    evpcmpeqb(ktmp, xtmp1, xtmp3, vec_enc);
    evpaddb(dst, ktmp, dst, xtmp2, true, vec_enc);
    return;
  }

  fatal("Unsupported type %s", type2name(bt));
}
// Emits a per-byte leading zero count of src into dst using a nibble lookup
// table and AVX blends (no opmask registers).
// xtmp1, xtmp2, xtmp3 and rtmp are scratch. On return xtmp1 holds all zeros;
// vector_count_leading_zeros_short_avx and _long_avx rely on that.
void C2_MacroAssembler::vector_count_leading_zeros_byte_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                            XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
  // xtmp1 = lookup table, xtmp2 = 0x0F mask in every byte.
  vmovdqu(xtmp1, ExternalAddress(StubRoutines::x86::vector_count_leading_zeros_lut()), rtmp);
  vbroadcast(T_INT, xtmp2, 0x0F0F0F0F, rtmp, vec_enc);
  // T1 = Compute leading zero counts of 4 LSB bits of each byte by
  // accessing the lookup table.
  vpand(dst, xtmp2, src, vec_enc);
  vpshufb(dst, xtmp1, dst, vec_enc);
  // T2 = Compute leading zero counts of 4 MSB bits of each byte by
  // accessing the lookup table.
  vpsrlw(xtmp3, src, 4, vec_enc);
  vpand(xtmp3, xtmp2, xtmp3, vec_enc);
  vpshufb(xtmp2, xtmp1, xtmp3, vec_enc);
  // Add T1 to T2 if 4 MSB bits of byte are all zeros.
  // xtmp3 becomes the mask of bytes whose upper nibble is zero; the blend keeps
  // T2 (xtmp2) elsewhere and takes T1 + T2 (dst) where the mask is set.
  vpxor(xtmp1, xtmp1, xtmp1, vec_enc);
  vpcmpeqb(xtmp3, xtmp1, xtmp3, vec_enc);
  vpaddb(dst, dst, xtmp2, vec_enc);
  vpblendvb(dst, xtmp2, dst, xtmp3, vec_enc);
}
// Emits a per-word (16 bit) leading zero count of src into dst, built on top
// of the per-byte counts from vector_count_leading_zeros_byte_avx.
// xtmp1, xtmp2, xtmp3 and rtmp are scratch.
void C2_MacroAssembler::vector_count_leading_zeros_short_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                             XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
  vector_count_leading_zeros_byte_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
  // Add zero counts of lower byte and upper byte of a word if
  // upper byte holds a zero value.
  vpsrlw(xtmp3, src, 8, vec_enc);
  // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
  vpcmpeqw(xtmp3, xtmp1, xtmp3, vec_enc);
  // Move the lower byte's count into the upper byte position and add, so the
  // upper byte of xtmp2 holds the sum of both byte counts.
  vpsllw(xtmp2, dst, 8, vec_enc);
  vpaddw(xtmp2, xtmp2, dst, vec_enc);
  // Take the summed value only for words selected by the mask in xtmp3.
  vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
  // The word result sits in the upper byte; shift it down.
  vpsrlw(dst, dst, 8, vec_enc);
}
// Per-doubleword leading zero count for AVX targets, based on the exponent of
// the value converted to float (see the formula below).
//
// NOTE(review): the body as it stands cannot be complete. The first statement
// reads dst before anything here writes it, xtmp3 is consumed by the vpblendvb
// without ever being set, and xtmp1 is never used. The statements that convert
// src, extract the biased exponent and build the constants appear to be
// missing -- restore them from the upstream source.
void C2_MacroAssembler::vector_count_leading_zeros_int_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                           XMMRegister xtmp2, XMMRegister xtmp3, int vec_enc) {
  // Since IEEE 754 floating point format represents mantissa in 1.0 format
  // hence biased exponent can be used to compute leading zero count as per
  // following formula:-
  // LZCNT = 32 - (biased_exp - 127)
  // Special handling has been introduced for Zero, Max_Int and -ve source values.
  // Replace -ve exponent with zero, exponent is -ve when src
  // lane contains a zero value.
  vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
  vblendvps(dst, dst, xtmp2, dst, vec_enc);
  // Replace LZCNT with a value 1 if corresponding source lane
  // contains max_int value.
  vpblendvb(dst, dst, xtmp3, xtmp2, vec_enc);
  // Replace biased_exp with 0 if source lane value is less than zero.
  vpxor(xtmp2, xtmp2, xtmp2, vec_enc);
  vblendvps(dst, dst, xtmp2, src, vec_enc);
}
// Emits a per-quadword leading zero count of src into dst. Starts from the
// per-word counts of vector_count_leading_zeros_short_avx and widens the
// result twice (word -> doubleword -> quadword) with the same
// shift / add / blend / shift pattern.
// xtmp1, xtmp2, xtmp3 and rtmp are scratch.
void C2_MacroAssembler::vector_count_leading_zeros_long_avx(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                                                            XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, int vec_enc) {
  vector_count_leading_zeros_short_avx(dst, src, xtmp1, xtmp2, xtmp3, rtmp, vec_enc);
  // Add zero counts of lower word and upper word of a double word if
  // upper word holds a zero value.
  vpsrld(xtmp3, src, 16, vec_enc);
  // xtmp1 is set to all zeros by vector_count_leading_zeros_byte_avx.
  vpcmpeqd(xtmp3, xtmp1, xtmp3, vec_enc);
  // Upper word of xtmp2 = lower word count + upper word count.
  vpslld(xtmp2, dst, 16, vec_enc);
  vpaddd(xtmp2, xtmp2, dst, vec_enc);
  vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
  // The doubleword result sits in the upper word; shift it down.
  vpsrld(dst, dst, 16, vec_enc);
  // Add zero counts of lower doubleword and upper doubleword of a
  // quadword if upper doubleword holds a zero value.
  vpsrlq(xtmp3, src, 32, vec_enc);
  vpcmpeqq(xtmp3, xtmp1, xtmp3, vec_enc);
  // Upper doubleword of xtmp2 = lower doubleword count + upper doubleword count.
  vpsllq(xtmp2, dst, 32, vec_enc);
  vpaddq(xtmp2, xtmp2, dst, vec_enc);
  vpblendvb(dst, dst, xtmp2, xtmp3, vec_enc);
  // The quadword result sits in the upper doubleword; shift it down.
  vpsrlq(dst, dst, 32, vec_enc);
}
// Emits a cross-lane byte rearrange of src into dst, driven by the byte
// indices in 'shuffle'. Requires AVX512BW.
// The work is done in four masked steps, one per 128 bit source lane: the
// lane is broadcast across a temporary, shuffled in-lane with the original
// indices, and only the bytes whose index falls into that lane's range are
// written to dst.
// xtmp1, xtmp2, xtmp3, rtmp and ktmp are scratch.
void C2_MacroAssembler::rearrange_bytes(XMMRegister dst, XMMRegister shuffle, XMMRegister src, XMMRegister xtmp1,
                                        XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp, KRegister ktmp, int vlen_enc) {
  assert(VM_Version::supports_avx512bw(), "");
  // Byte shuffles are inlane operations and indices are determined using
  // lower 4 bit of each shuffle lane, thus all shuffle indices are
  // normalized to index range 0-15. This makes sure that all the multiples
  // of an index value are placed at same relative position in 128 bit
  // lane i.e. elements corresponding to shuffle indices 16, 32 and 64
  // will be 16th element in their respective 128 bit lanes.
  // xtmp1 = 16 in every byte.
  movl(rtmp, 16);
  evpbroadcastb(xtmp1, rtmp, vlen_enc);
  // Compute a mask for shuffle vector by comparing indices with expression INDEX < 16,
  // Broadcast first 128 bit lane across entire vector, shuffle the vector lanes using
  // original shuffle indices and move the shuffled lanes corresponding to true
  // mask to destination vector.
  evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp2, src, src, 0x0, vlen_enc);
  // First step passes merge == false; the following steps pass merge == true.
  evpshufb(dst, ktmp, xtmp2, shuffle, false, vlen_enc);
  // Perform above steps with lane comparison expression as INDEX >= 16 && INDEX < 32
  // and broadcasting second 128 bit lane.
  evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::nlt, true, vlen_enc);
  // xtmp2 = 32 in every byte (16 << 1).
  vpsllq(xtmp2, xtmp1, 0x1, vlen_enc);
  evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp3, src, src, 0x55, vlen_enc);
  evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
  // Perform above steps with lane comparison expression as INDEX >= 32 && INDEX < 48
  // and broadcasting third 128 bit lane.
  evpcmpb(ktmp, k0, shuffle, xtmp2, Assembler::nlt, true, vlen_enc);
  // xtmp1 = 48 in every byte (16 + 32).
  vpaddb(xtmp1, xtmp1, xtmp2, vlen_enc);
  evpcmpb(ktmp, ktmp, shuffle, xtmp1, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp3, src, src, 0xAA, vlen_enc);
  evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
  // Perform above steps with lane comparison expression as INDEX >= 48 && INDEX < 64
  // and broadcasting fourth 128 bit lane.
  evpcmpb(ktmp, k0, shuffle, xtmp1, Assembler::nlt, true, vlen_enc);
  // xtmp2 = 64 in every byte (32 << 1).
  vpsllq(xtmp2, xtmp2, 0x1, vlen_enc);
  evpcmpb(ktmp, ktmp, shuffle, xtmp2, Assembler::lt, true, vlen_enc);
  evshufi64x2(xtmp3, src, src, 0xFF, vlen_enc);
  evpshufb(dst, ktmp, xtmp3, shuffle, true, vlen_enc);
}
Messung V0.5 in Prozent
¤ Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.0.147Bemerkung:
(vorverarbeitet am 2026-04-26)
¤
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.