/* * Copyright (c) 2003, 2022, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2014, 2022, Red Hat Inc. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License version 2 only, as * published by the Free Software Foundation. * * This code is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License * version 2 for more details (a copy is included in the LICENSE file that * accompanied this code). * * You should have received a copy of the GNU General Public License version * 2 along with this work; if not, write to the Free Software Foundation, * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. * * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA * or visit www.oracle.com if you need additional information or have any * questions. *
*/
// Declaration and definition of StubGenerator (no .hpp file). // For a more detailed description of the stub routine structure // see the comment in stubRoutines.hpp
// Call stubs are used to call Java from C // // Arguments: // c_rarg0: call wrapper address address // c_rarg1: result address // c_rarg2: result type BasicType // c_rarg3: method Method* // c_rarg4: (interpreter) entry point address // c_rarg5: parameters intptr_t* // c_rarg6: parameter size (in words) int // c_rarg7: thread Thread* // // There is no return from the stub itself as any Java result // is written to result // // we save r30 (lr) as the return PC at the base of the frame and // link r29 (fp) below it as the frame pointer installing sp (r31) // into fp. // // we save r0-r7, which accounts for all the c arguments. // // TODO: strictly do we need to save them all? they are treated as // volatile by C so could we omit saving the ones we are going to // place in global registers (thread? method?) or those we only use // during setup of the Java call? // // we don't need to save r8 which C uses as an indirect result location // return register. // // we don't need to save r9-r15 which both C and Java treat as // volatile // // we don't need to save r16-18 because Java does not use them // // we save r19-r28 which Java uses as scratch registers and C // expects to be callee-save // // we save the bottom 64 bits of each value stored in v8-v15; it is // the responsibility of the caller to preserve larger values. // // so the stub frame looks like this when we enter Java code // // [ return_from_Java ] <--- sp // [ argument word n ] // ... 
// -27 [ argument word 1 ] // -26 [ saved v15 ] <--- sp_after_call // -25 [ saved v14 ] // -24 [ saved v13 ] // -23 [ saved v12 ] // -22 [ saved v11 ] // -21 [ saved v10 ] // -20 [ saved v9 ] // -19 [ saved v8 ] // -18 [ saved r28 ] // -17 [ saved r27 ] // -16 [ saved r26 ] // -15 [ saved r25 ] // -14 [ saved r24 ] // -13 [ saved r23 ] // -12 [ saved r22 ] // -11 [ saved r21 ] // -10 [ saved r20 ] // -9 [ saved r19 ] // -8 [ call wrapper (r0) ] // -7 [ result (r1) ] // -6 [ result type (r2) ] // -5 [ method (r3) ] // -4 [ entry point (r4) ] // -3 [ parameters (r5) ] // -2 [ parameter size (r6) ] // -1 [ thread (r7) ] // 0 [ saved fp (r29) ] <--- fp == saved sp (r31) // 1 [ saved lr (r30) ]
// Call stub stack layout word offsets from fp enum call_stub_layout {
sp_after_call_off = -26,
// set up frame and move sp to end of save area
__ enter();
__ sub(sp, rfp, -sp_after_call_off * wordSize);
// save register parameters and Java scratch/global registers // n.b. we save thread even though it gets installed in // rthread because we want to sanity check rthread later
__ str(c_rarg7, thread);
__ strw(c_rarg6, parameter_size);
__ stp(c_rarg4, c_rarg5, entry_point);
__ stp(c_rarg2, c_rarg3, result_type);
__ stp(c_rarg0, c_rarg1, call_wrapper);
// install Java thread in global register now we have saved // whatever value it held
__ mov(rthread, c_rarg7); // And method
__ mov(rmethod, c_rarg3);
// set up the heapbase register
__ reinit_heapbase();
#ifdef ASSERT // make sure we have no pending exceptions
{
Label L;
__ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
__ cmp(rscratch1, (u1)NULL_WORD);
__ br(Assembler::EQ, L);
__ stop("StubRoutines::call_stub: entered with pending exception");
__ BIND(L);
} #endif // pass parameters if any
// esp takes the current sp value before sp itself is lowered below.
__ mov(esp, sp);
// rscratch1 = sp - (zero-extended c_rarg6 << LogBytesPerWord): c_rarg6 is
// the parameter size in words, so this steps over one slot per parameter word.
__ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
// Mask with -2 * wordSize so the new sp is rounded down to a multiple of
// two words.
__ andr(sp, rscratch1, -2 * wordSize);
BLOCK_COMMENT("pass parameters if any");
Label parameters_done; // parameter count is still in c_rarg6 // and parameter pointer identifying param 1 is in c_rarg5
__ cbzw(c_rarg6, parameters_done);
// we do this here because the notify will already have been done // if we get to the next instruction via an exception // // n.b. adding this instruction here affects the calculation of // whether or not a routine returns to the call stub (used when // doing stack walks) since the normal test is to check the return // pc against the address saved below. so we may need to allow for // this extra instruction in the check.
// save current address for use by exception handling code
return_address = __ pc();
// store result depending on type (everything that is not // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT) // n.b. this assumes Java returns an integral result in r0 // and a floating result in j_farg0
// j_rarg2 <- contents of the 'result' slot; it is used below as the address
// the Java return value is stored to.
// NOTE(review): 'result' and 'result_type' are declared outside this excerpt;
// presumably they address the frame slots saved from c_rarg1/c_rarg2 - confirm.
__ ldr(j_rarg2, result);
Label is_long, is_float, is_double, exit;
// j_rarg1 <- contents of the 'result_type' slot, compared against each
// BasicType constant in turn.
__ ldr(j_rarg1, result_type);
// T_OBJECT deliberately shares the is_long path with T_LONG.
__ cmp(j_rarg1, (u1)T_OBJECT);
__ br(Assembler::EQ, is_long);
__ cmp(j_rarg1, (u1)T_LONG);
__ br(Assembler::EQ, is_long);
__ cmp(j_rarg1, (u1)T_FLOAT);
__ br(Assembler::EQ, is_float);
__ cmp(j_rarg1, (u1)T_DOUBLE);
__ br(Assembler::EQ, is_double);
// handle T_INT case
// Fall-through for every other type: a 32-bit store of r0 to the result address.
__ strw(r0, Address(j_rarg2));
__ BIND(exit);
// pop parameters
__ sub(esp, rfp, -sp_after_call_off * wordSize);
// Return point for a Java call if there's an exception thrown in // Java code. The exception is caught and transformed into a // pending exception stored in JavaThread that can be tested from // within the VM. // // Note: Usually the parameters are removed by the callee. In case // of an exception crossing an activation frame boundary, that is // not the case if the callee is compiled code => need to setup the // rsp. // // r0: exception oop
// complete return to VM
assert(StubRoutines::_call_stub_return_address != NULL, "_call_stub_return_address must have been generated before");
__ b(StubRoutines::_call_stub_return_address);
return start;
}
// Continuation point for runtime calls returning with a pending // exception. The pending exception check happened in the runtime // or native call stub. The pending exception in Thread is // converted into a Java-level exception. // // Contract with Java-level exception handlers: // r0: exception // r3: throwing pc // // NOTE: At entry of this stub, exception-pc must be in LR !!
// NOTE: this is always used as a jump target within generated code // so it just needs to be generated code with no x86 prolog
// Upon entry, LR points to the return address returning into // Java (interpreted or compiled) code; i.e., the return address // becomes the throwing pc. // // Arguments pushed before the runtime call are still on the stack // but the exception handler will reset the stack pointer -> // ignore them. A potential result in registers can be ignored as // well.
#ifdef ASSERT // make sure this code is only executed if there is a pending exception
{
Label L;
__ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
__ cbnz(rscratch1, L);
__ stop("StubRoutines::forward exception: no pending exception (1)");
__ bind(L);
} #endif
// compute exception handler into r19
// call the VM to find the handler address associated with the // caller address. pass thread in r0 and caller pc (ret address) // in r1. n.b. the caller pc is in lr, unlike x86 where it is on // the stack.
__ mov(c_rarg1, lr); // lr will be trashed by the VM call so we move it to R19 // (callee-saved) because we also need to pass it to the handler // returned by this call.
__ mov(r19, lr);
BLOCK_COMMENT("call exception_handler_for_return_address");
__ call_VM_leaf(CAST_FROM_FN_PTR(address,
SharedRuntime::exception_handler_for_return_address),
rthread, c_rarg1); // Reinitialize the ptrue predicate register, in case the external runtime // call clobbers ptrue reg, as we may return to SVE compiled code.
__ reinitialize_ptrue();
// we should not really care that lr is no longer the callee // address. we saved the value the handler needs in r19 so we can // just copy it to r3. however, the C2 handler will push its own // frame and then calls into the VM and the VM code asserts that // the PC for the frame above the handler belongs to a compiled // Java method. So, we restore lr here to satisfy that assert.
__ mov(lr, r19); // setup r0 & r3 & clear pending exception
__ mov(r3, r19);
__ mov(r19, r0);
__ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
__ str(zr, Address(rthread, Thread::pending_exception_offset()));
#ifdef ASSERT // make sure exception is set
{
Label L;
__ cbnz(r0, L);
__ stop("StubRoutines::forward exception: no pending exception (2)");
__ bind(L);
} #endif
// continue at exception handler // r0: exception // r3: throwing pc // r19: exception handler
__ verify_oop(r0);
__ br(r19);
// object is in r0 // make sure object is 'reasonable'
__ cbz(r0, exit); // if obj is NULL it is OK
#if INCLUDE_ZGC if (UseZGC) { // Check if mask is good. // verifies that ZAddressBadMask & r0 == 0
__ ldr(c_rarg3, Address(rthread, ZThreadLocalData::address_bad_mask_offset()));
__ andr(c_rarg2, r0, c_rarg3);
__ cbnz(c_rarg2, error);
} #endif
// Check if the oop is in the right area of memory
__ mov(c_rarg3, (intptr_t) Universe::verify_oop_mask());
__ andr(c_rarg2, r0, c_rarg3);
__ mov(c_rarg3, (intptr_t) Universe::verify_oop_bits());
// Compare c_rarg2 and c_rarg3. We don't use a compare // instruction here because the flags register is live.
__ eor(c_rarg2, c_rarg2, c_rarg3);
__ cbnz(c_rarg2, error);
// make sure klass is 'reasonable', which is not zero.
__ load_klass(r0, r0); // get klass
__ cbz(r0, error); // if klass is NULL it is broken
// The inner part of zero_words(). This is the bulk operation, // zeroing words in blocks, possibly using DC ZVA to do it. The // caller is responsible for zeroing the last few words. // // Inputs: // r10: the HeapWord-aligned base address of an array to zero. // r11: the count in HeapWords, r11 > 0. // // Returns r10 and r11, adjusted for the caller to clear. // r10: the base address of the tail of words left to clear. // r11: the number of words in the tail. // r11 < MacroAssembler::zero_words_block_size.
// Bulk copy of blocks of 8 words. // // count is a count of words. // // Precondition: count >= 8 // // Postconditions: // // The least significant bit of count contains the remaining count // of words to copy. The rest of count is trash. // // s and d are adjusted to point to the remaining words to copy // void generate_copy_longs(Label &start, Register s, Register d, Register count,
copy_direction direction) { int unit = wordSize * direction; int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;
#ifdef ASSERT // Make sure we are never given < 8 words
{
Label L;
__ cmp(count, (u1)8);
__ br(Assembler::GE, L);
__ stop("genrate_copy_longs called with < 8 words");
__ bind(L);
} #endif
__ tbz(count, 1, L2);
__ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
__ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
__ bind(L2);
}
__ ret(lr);
if (AvoidUnalignedAccesses) {
Label drain, again; // Register order for storing. Order is different for backward copy.
__ bind(unaligned_copy_long);
// source address is even aligned, target odd aligned
//
// when forward copying word pairs we read long pairs at offsets
// {0, 2, 4, 6} (in long words). when backwards copying we read
// long pairs at offsets {-2, -4, -6, -8}. We adjust the source
// address by -2 in the forwards case so we can compute the
// source offsets for both as {2, 4, 6, 8} * unit where unit = 1
// or -1.
//
// when forward copying we need to store 1 word, 3 pairs and
// then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
// zero offset we adjust the destination by -1 which means we
// have to use offsets { 1, 2, 4, 6, 8} * unit for the stores.
//
// When backwards copying we need to store 1 word, 3 pairs and
// then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
// offsets {1, 3, 5, 7, 8} * unit.
// Fill 8 registers
//
// for forwards copy s was offset by -16 from the original input
// value of s so the register contents are at these offsets
// relative to the 64 byte block addressed by that original input
// and so on for each successive 64 byte block when s is updated
//
// t0 at offset 0,  t1 at offset 8
// t2 at offset 16, t3 at offset 24
// t4 at offset 32, t5 at offset 40
// t6 at offset 48, t7 at offset 56
// for backwards copy s was not offset so the register contents // are at these offsets into the preceding 64 byte block // relative to that original input and so on for each successive // preceding 64 byte block when s is updated. this explains the // slightly counter-intuitive looking pattern of register usage // in the stp instructions for backwards copy. // // t0 at offset -16, t1 at offset -8 // t2 at offset -32, t3 at offset -24 // t4 at offset -48, t5 at offset -40 // t6 at offset -64, t7 at offset -56
if (direction == copy_forwards) {
// allowing for the offset of -8 the store instructions place
// registers into the target 64 byte block at the following
// offsets
//
// t0 at offset 0
// t1 at offset 8,  t2 at offset 16
// t3 at offset 24, t4 at offset 32
// t5 at offset 40, t6 at offset 48
// t7 at offset 56
__ str(t0, Address(d, 1 * unit));
__ stp(t1, t2, Address(d, 2 * unit));
__ ldp(t0, t1, Address(s, 2 * unit));
__ stp(t3, t4, Address(d, 4 * unit));
__ ldp(t2, t3, Address(s, 4 * unit));
__ stp(t5, t6, Address(d, 6 * unit));
__ ldp(t4, t5, Address(s, 6 * unit));
__ str(t7, Address(__ pre(d, 8 * unit)));
__ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
} else {
// d was not offset when we started so the registers are
// written into the 64 byte block preceding d with the following
// offsets
//
// t1 at offset -8
// t3 at offset -24, t0 at offset -16
// t5 at offset -40, t2 at offset -32
// t7 at offset -56, t4 at offset -48
//                   t6 at offset -64
//
// note that this matches the offsets previously noted for the
// loads
// Drain // // this uses the same pattern of offsets and register arguments // as above
__ bind(drain); if (direction == copy_forwards) {
__ str(t0, Address(d, 1 * unit));
__ stp(t1, t2, Address(d, 2 * unit));
__ stp(t3, t4, Address(d, 4 * unit));
__ stp(t5, t6, Address(d, 6 * unit));
__ str(t7, Address(__ pre(d, 8 * unit)));
} else {
__ str(t1, Address(d, 1 * unit));
__ stp(t3, t0, Address(d, 3 * unit));
__ stp(t5, t2, Address(d, 5 * unit));
__ stp(t7, t4, Address(d, 7 * unit));
__ str(t6, Address(__ pre(d, 8 * unit)));
} // now we need to copy any remaining part block which may // include a 4 word block subblock and/or a 2 word subblock. // bits 2 and 1 in the count are the tell-tale for whether we // have each such subblock
{
Label L1, L2;
__ tbz(count, exact_log2(4), L1); // this is the same as above but copying only 4 longs hence // with only one intervening stp between the str instructions // but note that the offsets and registers still follow the // same pattern
__ ldp(t0, t1, Address(s, 2 * unit));
__ ldp(t2, t3, Address(__ pre(s, 4 * unit))); if (direction == copy_forwards) {
__ str(t0, Address(d, 1 * unit));
__ stp(t1, t2, Address(d, 2 * unit));
__ str(t3, Address(__ pre(d, 4 * unit)));
} else {
__ str(t1, Address(d, 1 * unit));
__ stp(t3, t0, Address(d, 3 * unit));
__ str(t2, Address(__ pre(d, 4 * unit)));
}
__ bind(L1);
__ tbz(count, 1, L2); // this is the same as above but copying only 2 longs hence // there is no intervening stp between the str instructions // but note that the offset and register patterns are still // the same
__ ldp(t0, t1, Address(__ pre(s, 2 * unit))); if (direction == copy_forwards) {
__ str(t0, Address(d, 1 * unit));
__ str(t1, Address(__ pre(d, 2 * unit)));
} else {
__ str(t1, Address(d, 1 * unit));
__ str(t0, Address(__ pre(d, 2 * unit)));
}
__ bind(L2);
// for forwards copy we need to re-adjust the offsets we
// applied so that s and d follow the last words written
// Small copy: less than 16 bytes. // // NB: Ignores all of the bits of count which represent more than 15 // bytes, so a caller doesn't have to mask them.
void copy_memory_small(Register s, Register d, Register count, Register tmp, int step) { bool is_backwards = step < 0;
size_t granularity = uabs(step); int direction = is_backwards ? -1 : 1; int unit = wordSize * direction;
Label Lword, Lint, Lshort, Lbyte;
assert(granularity
&& granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");
constRegister t0 = r3, t1 = r4, t2 = r5, t3 = r6;
// ??? I don't know if this bit-test-and-branch is the right thing // to do. It does a lot of jumping, resulting in several // mispredicted branches. It might make more sense to do this // with something like Duff's device with a single computed branch.
__ tbz(count, 3 - exact_log2(granularity), Lword);
__ ldr(tmp, Address(__ adjust(s, unit, is_backwards)));
__ str(tmp, Address(__ adjust(d, unit, is_backwards)));
__ bind(Lword);
// All-singing all-dancing memory copy. // // Copy count units of memory from s to d. The size of a unit is // step, which can be positive or negative depending on the direction // of copy. If is_aligned is false, we align the source address. //
// 65..80/96 bytes
// (96 bytes if SIMD because we do 32 bytes per instruction)
__ bind(copy80); if (UseSIMDForMemoryOps) {
__ ldpq(v0, v1, Address(s, 0));
__ ldpq(v2, v3, Address(s, 32)); // Unaligned pointers can be an issue for copying. // The issue has more chances to happen when granularity of data is // less than 4(sizeof(jint)). Pointers for arrays of jint are at least // 4 byte aligned. Pointers for arrays of jlong are 8 byte aligned. // The most performance drop has been seen for the range 65-80 bytes. // For such cases using the pair of ldp/stp instead of the third pair of // ldpq/stpq fixes the performance issue. if (granularity < sizeof (jint)) {
Label copy96;
__ cmp(count, u1(80/granularity));
__ br(Assembler::HI, copy96);
__ ldp(t0, t1, Address(send, -16));
// Now we've got the small case out of the way we can align the // source address on a 2-word boundary.
Label aligned;
if (is_aligned) { // We may have to adjust by 1 word to get s 2-word-aligned.
__ tbz(s, exact_log2(wordSize), aligned);
__ ldr(tmp, Address(__ adjust(s, direction * wordSize, is_backwards)));
__ str(tmp, Address(__ adjust(d, direction * wordSize, is_backwards)));
__ sub(count, count, wordSize/granularity);
} else { if (is_backwards) {
__ andr(rscratch2, s, 2 * wordSize - 1);
} else {
__ neg(rscratch2, s);
__ andr(rscratch2, rscratch2, 2 * wordSize - 1);
} // rscratch2 is the byte adjustment needed to align s.
__ cbz(rscratch2, aligned); int shift = exact_log2(granularity); if (shift) __ lsr(rscratch2, rscratch2, shift);
__ sub(count, count, rscratch2);
#if 0 // ?? This code is only correct for a disjoint copy. It may or // may not make sense to use it in that case.
// Copy the first pair; s and d may not be aligned.
__ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
__ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));
// We have a count of units and some trailing bytes. Adjust the // count and do a bulk copy of words.
__ lsr(rscratch2, count, exact_log2(wordSize/granularity)); if (direction == copy_forwards)
__ bl(copy_f); else
__ bl(copy_b);
// And the tail.
copy_memory_small(s, d, count, tmp, step);
if (granularity >= 8) __ bind(copy8); if (granularity >= 4) __ bind(copy4);
__ bind(finish);
}
// Scan over array at a for count oops, verifying each one. // Preserves a and count, clobbers rscratch1 and rscratch2. void verify_oop_array (int size, Register a, Register count, Register temp) {
Label loop, end;
__ mov(rscratch1, a);
__ mov(rscratch2, zr);
__ bind(loop);
__ cmp(rscratch2, count);
__ br(Assembler::HS, end); if (size == wordSize) {
__ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
__ verify_oop(temp);
} else {
__ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
__ decode_heap_oop(temp); // calls verify_oop
}
__ add(rscratch2, rscratch2, 1);
__ b(loop);
__ bind(end);
}
// Arguments: // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary // ignored // is_oop - true => oop array, so generate store check code // name - stub name string // // Inputs: // c_rarg0 - source array address // c_rarg1 - destination array address // c_rarg2 - element count, treated as ssize_t, can be zero // // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let // the hardware handle it. The two dwords within qwords that span // cache line boundaries will still be loaded and stored atomically. // // Side Effects: // disjoint_int_copy_entry is set to the no-overlap entry point // used by generate_conjoint_int_oop_copy(). //
address generate_disjoint_copy(int size, bool aligned, bool is_oop, address *entry, constchar *name, bool dest_uninitialized = false) { Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
RegSet saved_reg = RegSet::of(s, d, count);
__ align(CodeEntryAlignment);
StubCodeMark mark(this, "StubRoutines", name);
address start = __ pc();
__ enter();
if (entry != NULL) {
*entry = __ pc(); // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
BLOCK_COMMENT("Entry:");
}
// Arguments: // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary // ignored // is_oop - true => oop array, so generate store check code // name - stub name string // // Inputs: // c_rarg0 - source array address // c_rarg1 - destination array address // c_rarg2 - element count, treated as ssize_t, can be zero // // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let // the hardware handle it. The two dwords within qwords that span // cache line boundaries will still be loaded and stored atomically. //
address generate_conjoint_copy(int size, bool aligned, bool is_oop, address nooverlap_target,
address *entry, constchar *name, bool dest_uninitialized = false) { Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
RegSet saved_regs = RegSet::of(s, d, count);
StubCodeMark mark(this, "StubRoutines", name);
address start = __ pc();
__ enter();
if (entry != NULL) {
*entry = __ pc(); // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
BLOCK_COMMENT("Entry:");
}
// use fwd copy when (d-s) above_equal (count*size)
// rscratch1 = d - s, the byte distance from source to destination.
__ sub(rscratch1, d, s);
// Compare that distance with the copy length in bytes, formed as
// count << log2(size).
__ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
// HS is an unsigned >=. When d is below s the subtraction wraps to a large
// unsigned value, so that case also branches to nooverlap_target.
__ br(Assembler::HS, nooverlap_target);
DecoratorSet decorators = IN_HEAP | IS_ARRAY; if (dest_uninitialized) {
decorators |= IS_DEST_UNINITIALIZED;
} if (aligned) {
decorators |= ARRAYCOPY_ALIGNED;
}
// Arguments:
//   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
//             ignored
//   entry   - handed on unchanged to generate_disjoint_copy()
//   name    - stub name string
//
// Inputs:
//   c_rarg0   - source array address
//   c_rarg1   - destination array address
//   c_rarg2   - element count, treated as ssize_t, can be zero
//
// If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
// we let the hardware handle it.  The one to eight bytes within words,
// dwords or qwords that span cache line boundaries will still be loaded
// and stored atomically.
//
// Side Effects:
//   disjoint_byte_copy_entry is set to the no-overlap entry point
//   used by generate_conjoint_byte_copy().
//
address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
  // jbyte elements: the generic generator is called with is_oop == false.
  const bool not_oop = false;
  return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name);
}
// Arguments:
//   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
//             ignored
//   nooverlap_target - handed on unchanged to generate_conjoint_copy()
//   entry   - handed on unchanged to generate_conjoint_copy()
//   name    - stub name string
//
// Inputs:
//   c_rarg0   - source array address
//   c_rarg1   - destination array address
//   c_rarg2   - element count, treated as ssize_t, can be zero
//
// If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
// we let the hardware handle it.  The one to eight bytes within words,
// dwords or qwords that span cache line boundaries will still be loaded
// and stored atomically.
//
address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
                                    address* entry, const char *name) {
  // jbyte elements: the generic generator is called with is_oop == false.
  const bool not_oop = false;
  return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name);
}
// Arguments:
//   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
//             ignored
//   entry   - handed on unchanged to generate_disjoint_copy()
//   name    - stub name string
//
// Inputs:
//   c_rarg0   - source array address
//   c_rarg1   - destination array address
//   c_rarg2   - element count, treated as ssize_t, can be zero
//
// If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
// let the hardware handle it.  The two or four words within dwords
// or qwords that span cache line boundaries will still be loaded
// and stored atomically.
//
// Side Effects:
//   disjoint_short_copy_entry is set to the no-overlap entry point
//   used by generate_conjoint_short_copy().
//
address generate_disjoint_short_copy(bool aligned,
                                     address* entry, const char *name) {
  // jshort elements: the generic generator is called with is_oop == false.
  const bool not_oop = false;
  return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name);
}
// Arguments:
//   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
//             ignored
//   nooverlap_target - handed on unchanged to generate_conjoint_copy()
//   entry   - handed on unchanged to generate_conjoint_copy()
//   name    - stub name string
//
// Inputs:
//   c_rarg0   - source array address
//   c_rarg1   - destination array address
//   c_rarg2   - element count, treated as ssize_t, can be zero
//
// If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
// let the hardware handle it.  The two or four words within dwords
// or qwords that span cache line boundaries will still be loaded
// and stored atomically.
//
address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
                                     address *entry, const char *name) {
  // jshort elements: the generic generator is called with is_oop == false.
  const bool not_oop = false;
  return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name);
}

// Arguments:
//   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
//             ignored
//   name    - stub name string
//
// Inputs:
//   c_rarg0   - source array address
//   c_rarg1   - destination array address
//   c_rarg2   - element count, treated as ssize_t, can be zero
//
// If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
// the hardware handle it.  The two dwords within qwords that span
// cache line boundaries will still be loaded and stored atomically.
//
// Side Effects:
//   disjoint_int_copy_entry is set to the no-overlap entry point
//   used by generate_conjoint_int_oop_copy().
//
address generate_disjoint_int_copy(bool aligned, address *entry,
                                   const char *name, bool dest_uninitialized = false) {
  // jint elements: the generic generator is called with is_oop == false.
  // dest_uninitialized is part of the signature but is not forwarded here.
  const bool not_oop = false;
  return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name);
}
// Arguments:
//   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
//             ignored
//   nooverlap_target - handed on unchanged to generate_conjoint_copy()
//   entry   - handed on unchanged to generate_conjoint_copy()
//   name    - stub name string
//   dest_uninitialized - accepted but not forwarded by this wrapper
//
// Inputs:
//   c_rarg0   - source array address
//   c_rarg1   - destination array address
//   c_rarg2   - element count, treated as ssize_t, can be zero
//
// If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
// the hardware handle it.  The two dwords within qwords that span
// cache line boundaries will still be loaded and stored atomically.
//
address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
                                   address *entry, const char *name,
                                   bool dest_uninitialized = false) {
  // jint elements: the generic generator is called with is_oop == false.
  const bool not_oop = false;
  return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name);
}
// Arguments:
//   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
//             ignored
//   entry   - handed on unchanged to generate_disjoint_copy()
//   name    - stub name string
//   dest_uninitialized - accepted but not forwarded by this wrapper
//
// Inputs:
//   c_rarg0   - source array address
//   c_rarg1   - destination array address
//   c_rarg2   - element count, treated as size_t, can be zero
//
// Side Effects:
//   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
//   no-overlap entry point used by generate_conjoint_long_oop_copy().
//
address generate_disjoint_long_copy(bool aligned, address *entry,
                                    const char *name, bool dest_uninitialized = false) {
  // jlong elements: the generic generator is called with is_oop == false.
  const bool not_oop = false;
  return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name);
}
// Arguments:
//   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
//             ignored
//   nooverlap_target - handed on unchanged to generate_conjoint_copy()
//   entry   - handed on unchanged to generate_conjoint_copy()
//   name    - stub name string
//   dest_uninitialized - accepted but not forwarded by this wrapper
//
// Inputs:
//   c_rarg0   - source array address
//   c_rarg1   - destination array address
//   c_rarg2   - element count, treated as size_t, can be zero
//
address generate_conjoint_long_copy(bool aligned,
                                    address nooverlap_target, address *entry,
                                    const char *name, bool dest_uninitialized = false) {
  // jlong elements: the generic generator is called with is_oop == false.
  const bool not_oop = false;
  return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name);
}
// Arguments:
//   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
//             ignored
//   entry   - handed on unchanged to generate_disjoint_copy()
//   name    - stub name string
//   dest_uninitialized - handed on unchanged to generate_disjoint_copy()
//
// Inputs:
//   c_rarg0   - source array address
//   c_rarg1   - destination array address
//   c_rarg2   - element count, treated as size_t, can be zero
//
// Side Effects:
//   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
//   no-overlap entry point used by generate_conjoint_long_oop_copy().
//
address generate_disjoint_oop_copy(bool aligned, address *entry,
                                   const char *name, bool dest_uninitialized) {
  const bool is_oop = true;
  // Element size is sizeof(jint) when UseCompressedOops is set, else sizeof(jlong).
  const int size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
  return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized);
}
// Arguments:
//   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
//             ignored
//   nooverlap_target - handed on unchanged to generate_conjoint_copy()
//   entry   - handed on unchanged to generate_conjoint_copy()
//   name    - stub name string
//   dest_uninitialized - handed on unchanged to generate_conjoint_copy()
//
// Inputs:
//   c_rarg0   - source array address
//   c_rarg1   - destination array address
//   c_rarg2   - element count, treated as size_t, can be zero
//
address generate_conjoint_oop_copy(bool aligned,
                                   address nooverlap_target, address *entry,
                                   const char *name, bool dest_uninitialized) {
  const bool is_oop = true;
  // Element size is sizeof(jint) when UseCompressedOops is set, else sizeof(jlong).
  const int size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
  return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry,
                                name, dest_uninitialized);
}
// Helper for generating a dynamic type check. // Smashes rscratch1, rscratch2. void generate_type_check(Register sub_klass, Register super_check_offset, Register super_klass,
Label& L_success) {
assert_different_registers(sub_klass, super_check_offset, super_klass);
// Registers used as temps (r19, r20, r21, r22 are save-on-entry) constRegister copied_oop = r22; // actual oop copied constRegister count_save = r21; // orig elementscount constRegister start_to = r20; // destination array start address constRegister r19_klass = r19; // oop._klass
//--------------------------------------------------------------- // Assembler stub will be used for this call to arraycopy // if the two arrays are subtypes of Object[] but the // destination array type is not equal to or a supertype // of the source type. Each element must be separately // checked.
assert_different_registers(from, to, count, ckoff, ckval, start_to,
copied_oop, r19_klass, count_save);
__ enter(); // required for proper stackwalking of RuntimeStub frame
#ifdef ASSERT // caller guarantees that the arrays really are different // otherwise, we would have to make conjoint checks
{ Label L;
__ b(L); // conjoint check not yet implemented
__ stop("checkcast_copy within a single array");
__ bind(L);
} #endif//ASSERT
// Caller of this entry point must set up the argument registers. if (entry != NULL) {
*entry = __ pc();
BLOCK_COMMENT("Entry:");
}
// ======== loop entry is here ========
__ BIND(L_load_element);
__ load_heap_oop(copied_oop, __ post(from, UseCompressedOops ? 4 : 8), noreg, noreg, AS_RAW); // load the oop
__ cbz(copied_oop, L_store_element);
__ load_klass(r19_klass, copied_oop);// query the object klass
generate_type_check(r19_klass, ckoff, ckval, L_store_element); // ======== end loop ========
// It was a real error; we must depend on the caller to finish the job. // Register count = remaining oops, count_orig = total oops. // Emit GC store barriers for the oops we have copied and report // their number to the caller.
// Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
__ movw(src_pos, src_pos);
__ movw(dst_pos, dst_pos);
BLOCK_COMMENT("arraycopy_range_checks done");
}
// These stubs get called from some dumb test routine. // I'll write them properly when they're called from // something that's actually doing something. staticvoid fake_arraycopy_stub(address src, address dst, int count) {
assert(count == 0, "huh?");
}
//
//  Generate 'unsafe' array copy stub
//  Though just as safe as the other stubs, it takes an unscaled
//  size_t argument instead of an element count.
//
//  Input:
//    c_rarg0   - source array address
//    c_rarg1   - destination array address
//    c_rarg2   - byte count, treated as ssize_t, can be zero
//
// Examines the alignment of the operands and dispatches
// to a long, int, short, or byte copy loop.
//
address generate_unsafe_copy(constchar *name,
address byte_copy_entry,
address short_copy_entry,
address int_copy_entry,
address long_copy_entry) {
Label L_long_aligned, L_int_aligned, L_short_aligned; Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
// Registers used as temps constRegister dst_klass = c_rarg5;
__ align(CodeEntryAlignment);
StubCodeMark mark(this, "StubRoutines", name);
address start = __ pc();
__ enter(); // required for proper stackwalking of RuntimeStub frame
// bump this on entry, not on exit:
inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
//----------------------------------------------------------------------- // Assembler stub will be used for this call to arraycopy // if the following conditions are met: // // (1) src and dst must not be null. // (2) src_pos must not be negative. // (3) dst_pos must not be negative. // (4) length must not be negative. // (5) src klass and dst klass should be the same and not NULL. // (6) src and dst should be arrays. // (7) src_pos + length must not exceed length of src. // (8) dst_pos + length must not exceed length of dst. //
// if (src == NULL) return -1;
__ cbz(src, L_failed);
// if (src_pos < 0) return -1;
__ tbnz(src_pos, 31, L_failed); // i.e. sign bit set
// if (dst == NULL) return -1;
__ cbz(dst, L_failed);
// if (dst_pos < 0) return -1;
__ tbnz(dst_pos, 31, L_failed); // i.e. sign bit set
// registers used as temp constRegister scratch_length = r16; // elements count to copy constRegister scratch_src_klass = r17; // array klass constRegister lh = r15; // layout helper
// if (length < 0) return -1;
__ movw(scratch_length, length); // length (elements count, 32-bits value)
__ tbnz(scratch_length, 31, L_failed); // i.e. sign bit set
__ load_klass(scratch_src_klass, src); #ifdef ASSERT // assert(src->klass() != NULL);
{
BLOCK_COMMENT("assert klasses not null {");
Label L1, L2;
__ cbnz(scratch_src_klass, L2); // it is broken if klass is NULL
__ bind(L1);
__ stop("broken null klass");
__ bind(L2);
__ load_klass(rscratch1, dst);
__ cbz(rscratch1, L1); // this would be broken also
BLOCK_COMMENT("} assert klasses not null done");
} #endif
// if (!src->is_Array()) return -1;
__ tbz(lh, 31, L_failed); // i.e. (lh >= 0)
// At this point, it is known to be a typeArray (array_tag 0x3). #ifdef ASSERT
{
BLOCK_COMMENT("assert primitive array {");
Label L;
__ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
__ cmpw(lh, rscratch2);
__ br(Assembler::GE, L);
__ stop("must be a primitive array");
__ bind(L);
BLOCK_COMMENT("} assert primitive array done");
} #endif
__ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
exact_log2(Klass::_lh_header_size_mask+1)); // array_offset
__ add(src, src, rscratch1_offset); // src array offset
__ add(dst, dst, rscratch1_offset); // dst array offset
BLOCK_COMMENT("choose copy loop based on element size");
// next registers should be set before the jump to corresponding stub constRegister from = c_rarg0; // source array address constRegister to = c_rarg1; // destination array address constRegister count = c_rarg2; // elements count
// 'from', 'to', 'count' registers should be set in such order // since they are the same as 'src', 'src_pos', 'dst'.
assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
// The possible values of elsize are 0-3, i.e. exact_log2(element // size in bytes). We do a simple bitwise binary search.
__ BIND(L_copy_bytes);
__ tbnz(r15_elsize, 1, L_copy_ints);
__ tbnz(r15_elsize, 0, L_copy_shorts);
__ lea(from, Address(src, src_pos));// src_addr
__ lea(to, Address(dst, dst_pos));// dst_addr
__ movw(count, scratch_length); // length
__ b(RuntimeAddress(byte_copy_entry));
__ BIND(L_checkcast_copy); // live at this point: scratch_src_klass, scratch_length, r15 (dst_klass)
{ // Before looking at dst.length, make sure dst is also an objArray.
__ ldrw(rscratch1, Address(r15, lh_offset));
__ movw(rscratch2, objArray_lh);
__ eorw(rscratch1, rscratch1, rscratch2);
__ cbnzw(rscratch1, L_failed);
// It is safe to examine both src.length and dst.length.
arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
r15, L_failed);
__ load_klass(dst_klass, dst); // reload
// Marshal the base address arguments now, freeing registers.
__ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
__ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
__ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
__ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
__ movw(count, length); // length (reloaded) Register sco_temp = c_rarg3; // this register is free now
assert_different_registers(from, to, count, sco_temp,
dst_klass, scratch_src_klass); // assert_clean_int(count, sco_temp);
// Generate the type check. constint sco_offset = in_bytes(Klass::super_check_offset_offset());
__ ldrw(sco_temp, Address(dst_klass, sco_offset));
// Fetch destination element klass from the ObjArrayKlass header. int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
__ ldr(dst_klass, Address(dst_klass, ek_offset));
__ ldrw(sco_temp, Address(dst_klass, sco_offset));
// the checkcast_copy loop needs two extra arguments:
assert(c_rarg3 == sco_temp, "#3 already in place"); // Set up arguments for checkcast_copy_entry.
__ mov(c_rarg4, dst_klass); // dst.klass.element_klass
__ b(RuntimeAddress(checkcast_copy_entry));
}
__ BIND(L_failed);
__ mov(r0, -1);
__ leave(); // required for proper stackwalking of RuntimeStub frame
__ ret(lr);
return start;
}
//
// Generate stub for array fill. If "aligned" is true, the
// "to" address is assumed to be heapword aligned.
//
// Arguments for generated stub:
//   to:    c_rarg0
//   value: c_rarg1
//   count: c_rarg2 treated as signed
//
address generate_fill(BasicType t, bool aligned, constchar *name) {
__ align(CodeEntryAlignment);
StubCodeMark mark(this, "StubRoutines", name);
address start = __ pc();
BLOCK_COMMENT("Entry:");
constRegister to = c_rarg0; // source array address constRegister value = c_rarg1; // value constRegister count = c_rarg2; // elements count
constRegister bz_base = r10; // base for block_zero routine constRegister cnt_words = r11; // temp register
__ enter();
Label L_fill_elements, L_exit1;
int shift = -1; switch (t) { case T_BYTE:
shift = 0;
__ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
__ bfi(value, value, 8, 8); // 8 bit -> 16 bit
__ bfi(value, value, 16, 16); // 16 bit -> 32 bit
__ br(Assembler::LO, L_fill_elements); break; case T_SHORT:
shift = 1;
__ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
__ bfi(value, value, 16, 16); // 16 bit -> 32 bit
__ br(Assembler::LO, L_fill_elements); break; case T_INT:
shift = 2;
__ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
__ br(Assembler::LO, L_fill_elements); break; default: ShouldNotReachHere();
}
// Align source address at 8 bytes address boundary.
Label L_skip_align1, L_skip_align2, L_skip_align4; if (!aligned) { switch (t) { case T_BYTE: // One byte misalignment happens only for byte arrays.
__ tbz(to, 0, L_skip_align1);
__ strb(value, Address(__ post(to, 1)));
__ subw(count, count, 1);
__ bind(L_skip_align1); // Fallthrough case T_SHORT: // Two bytes misalignment happens only for byte and short (char) arrays.
__ tbz(to, 1, L_skip_align2);
__ strh(value, Address(__ post(to, 2)));
__ subw(count, count, 2 >> shift);
__ bind(L_skip_align2); // Fallthrough case T_INT: // Align to 8 bytes, we know we are 4 byte aligned to start.
__ tbz(to, 2, L_skip_align4);
__ strw(value, Address(__ post(to, 4)));
__ subw(count, count, 4 >> shift);
__ bind(L_skip_align4); break; default: ShouldNotReachHere();
}
}
// // Fill large chunks //
__ lsrw(cnt_words, count, 3 - shift); // number of words
__ bfi(value, value, 32, 32); // 32 bit -> 64 bit
__ subw(count, count, cnt_words, Assembler::LSL, 3 - shift); if (UseBlockZeroing) {
Label non_block_zeroing, rest; // If the fill value is zero we can use the fast zero_words().
__ cbnz(value, non_block_zeroing);
__ mov(bz_base, to);
__ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord);
__ zero_words(bz_base, cnt_words);
__ b(rest);
__ bind(non_block_zeroing);
__ fill_words(to, cnt_words, value);
__ bind(rest);
} else {
__ fill_words(to, cnt_words, value);
}
// Remaining count is less than 8 bytes. Fill it by a single store. // Note that the total length is no less than 8 bytes. if (t == T_BYTE || t == T_SHORT) {
Label L_exit1;
__ cbzw(count, L_exit1);
__ add(to, to, count, Assembler::LSL, shift); // points to the end
__ str(value, Address(to, -8)); // overwrite some elements
__ bind(L_exit1);
__ leave();
__ ret(lr);
}
//*** jint // Aligned versions
StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry, "arrayof_jint_disjoint_arraycopy");
StubRoutines::_arrayof_jint_arraycopy = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy, "arrayof_jint_arraycopy"); // In 64 bit we need both aligned and unaligned versions of jint arraycopy. // entry_jint_arraycopy always points to the unaligned version
StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_int_copy(false, &entry, "jint_disjoint_arraycopy");
StubRoutines::_jint_arraycopy = generate_conjoint_int_copy(false, entry,
&entry_jint_arraycopy, "jint_arraycopy");
constRegister from = c_rarg0; // source array address constRegister to = c_rarg1; // destination array address constRegister key = c_rarg2; // key array address constRegister rvec = c_rarg3; // r byte array initialized from initvector array address // and left with the results of the last encryption block constRegister len_reg = c_rarg4; // src len (must be multiple of blocksize 16) constRegister keylen = rscratch1;
constRegister from = c_rarg0; // source array address constRegister to = c_rarg1; // destination array address constRegister key = c_rarg2; // key array address constRegister rvec = c_rarg3; // r byte array initialized from initvector array address // and left with the results of the last encryption block constRegister len_reg = c_rarg4; // src len (must be multiple of blocksize 16) constRegister keylen = rscratch1;
constunsignedchar block_size = 16; constint bulk_width = 4; // NB: bulk_width can be 4 or 8. 8 gives slightly faster // performance with larger data sizes, but it also means that the // fast path isn't used until you have at least 8 blocks, and up // to 127 bytes of data will be executed on the slow path. For // that reason, and also so as not to blow away too much icache, 4 // blocks seems like a sensible compromise.
// Algorithm: // // if (len == 0) { // goto DONE; // } // int result = len; // do { // if (used >= blockSize) { // if (len >= bulk_width * blockSize) { // CTR_large_block(); // if (len == 0) // goto DONE; // } // for (;;) { // 16ByteVector v0 = counter; // embeddedCipher.encryptBlock(v0, 0, encryptedCounter, 0); // used = 0; // if (len < blockSize) // break; /* goto NEXT */ // 16ByteVector v1 = load16Bytes(in, offset); // v1 = v1 ^ encryptedCounter; // store16Bytes(out, offset); // used = blockSize; // offset += blockSize; // len -= blockSize; // if (len == 0) // goto DONE; // } // } // NEXT: // out[outOff++] = (byte)(in[inOff++] ^ encryptedCounter[used++]); // len--; // } while (len != 0); // DONE: // return result; // // CTR_large_block() // Wide bulk encryption of whole blocks.
// Compute #rounds for AES based on the length of the key array
__ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
__ aesenc_loadkeys(key, keylen);
{
Label L_CTR_loop, NEXT;
__ bind(L_CTR_loop);
__ cmp(used, block_size);
__ br(__ LO, NEXT);
// Maybe we have a lot of data
__ subsw(rscratch1, len, bulk_width * block_size);
__ br(__ HS, CTR_large_block);
__ BIND(large_block_return);
__ cbzw(len, DONE);
__ ld1(v0, __ T16B, counter); // Load the counter into v0
__ rev32(v16, __ T16B, v0);
__ addv(v16, __ T4S, v16, v4);
__ rev32(v16, __ T16B, v16);
__ st1(v16, __ T16B, counter); // Save the incremented counter back
{ // We have fewer than bulk_width blocks of data left. Encrypt // them one by one until there is less than a full block // remaining, being careful to save both the encrypted counter // and the counter.
Label inner_loop;
__ bind(inner_loop); // Counter to encrypt is in v0
__ aesecb_encrypt(noreg, noreg, keylen);
__ st1(v0, __ T16B, saved_encrypted_ctr);
// Vector AES Galois Counter Mode implementation. Parameters:
//
// in = c_rarg0
// len = c_rarg1
// ct = c_rarg2 - ciphertext that ghash will read (in for encrypt, out for decrypt)
// out = c_rarg3
// key = c_rarg4
// state = c_rarg5 - GHASH.state
// subkeyHtbl = c_rarg6 - powers of H
// counter = c_rarg7 - 16 bytes of CTR
// return - number of processed bytes
address generate_galoisCounterMode_AESCrypt() {
// Emits the code for the AES Galois Counter Mode stub and returns `start`.
// NOTE(review): `start` and `keylen` are used below but are not declared in
// the visible part of this function, and no matching __ enter() is visible
// for the __ leave() at the end - confirm against the complete definition.

// Remember the address at which the 128-bit constant below is emitted.
address ghash_polynomial = __ pc();
__ emit_int64(0x87); // The low-order bits of the field // polynomial (i.e. p = z^7+z^2+z+1) // repeated in the low and high parts of a // 128-bit vector
__ emit_int64(0x87);
// Name the argument registers c_rarg0..c_rarg5 used by the stub.
constRegister in = c_rarg0; constRegister len = c_rarg1; constRegister ct = c_rarg2; constRegister out = c_rarg3; // and updated with the incremented counter in the end
constRegister key = c_rarg4; constRegister state = c_rarg5;
// Compute #rounds for AES based on the length of the key array
// The 32-bit load is at offset (length_offset - base_offset(T_INT)) from `key`;
// presumably `key` points at the first int element, so this lands on the
// array's length field - TODO confirm.
__ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
__ leave(); // required for proper stackwalking of RuntimeStub frame
__ ret(lr);
// Hand the stub's entry address back to the caller of the generator.
return start;
}
// ChaCha20 block function. This version parallelizes by loading
// individual 32-bit state elements into vectors for four blocks
// (e.g. all four blocks' worth of state[0] in one register, etc.)
//
// state (int[16]) = c_rarg0
// keystream (byte[1024]) = c_rarg1
// return - number of bytes of keystream (always 256)
address generate_chacha20Block_blockpar() {
Label L_twoRounds, L_cc20_const; // The constant data is broken into two 128-bit segments to be loaded // onto FloatRegisters. The first 128 bits are a counter add overlay // that adds +0/+1/+2/+3 to the vector holding replicated state[12]. // The second 128-bits is a table constant used for 8-bit left rotations.
__ BIND(L_cc20_const);
__ emit_int64(0x0000000100000000UL);
__ emit_int64(0x0000000300000002UL);
__ emit_int64(0x0605040702010003UL);
__ emit_int64(0x0E0D0C0F0A09080BUL);
// Organize SIMD registers in an array that facilitates // putting repetitive opcodes into loop structures. It is // important that each grouping of 4 registers is monotonically // increasing to support the requirements of multi-register // instructions (e.g. ld4r, st4, etc.) const FloatRegister workSt[16] = {
v4, v5, v6, v7, v16, v17, v18, v19,
v20, v21, v22, v23, v24, v25, v26, v27
};
// Load from memory and interlace across 16 SIMD registers, // With each word from memory being broadcast to all lanes of // each successive SIMD register. // Addr(0) -> All lanes in workSt[i] // Addr(4) -> All lanes workSt[i + 1], etc.
__ mov(tmpAddr, state); for (i = 0; i < 16; i += 4) {
__ ld4r(workSt[i], workSt[i + 1], workSt[i + 2], workSt[i + 3], __ T4S,
__ post(tmpAddr, 16));
}
// Pull in constant data. The first 16 bytes are the add overlay // which is applied to the vector holding the counter (state[12]). // The second 16 bytes is the index register for the 8-bit left // rotation tbl instruction.
__ adr(tmpAddr, L_cc20_const);
__ ldpq(origCtrState, lrot8Tbl, Address(tmpAddr));
__ addv(workSt[12], __ T4S, workSt[12], origCtrState);
// Set up the 10 iteration loop and perform all 8 quarter round ops
__ mov(loopCtr, 10);
__ BIND(L_twoRounds);
// Add the starting state back to the post-loop keystream // state. We read/interlace the state array from memory into // 4 registers similar to what we did in the beginning. Then // add the counter overlay onto workSt[12] at the end. for (i = 0; i < 16; i += 4) {
__ ld4r(stateFirst, stateSecond, stateThird, stateFourth, __ T4S,
__ post(tmpAddr, 16));
__ addv(workSt[i], __ T4S, workSt[i], stateFirst);
__ addv(workSt[i + 1], __ T4S, workSt[i + 1], stateSecond);
__ addv(workSt[i + 2], __ T4S, workSt[i + 2], stateThird);
__ addv(workSt[i + 3], __ T4S, workSt[i + 3], stateFourth);
}
__ addv(workSt[12], __ T4S, workSt[12], origCtrState); // Add ctr mask
// Write to key stream, storing the same element out of workSt[0..15] // to consecutive 4-byte offsets in the key stream buffer, then repeating // for the next element position. for (i = 0; i < 4; i++) { for (j = 0; j < 16; j += 4) {
__ st4(workSt[j], workSt[j + 1], workSt[j + 2], workSt[j + 3], __ S, i,
__ post(keystream, 16));
}
}
// Max number of bytes we can process before having to take the mod // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
uint64_t BASE = 0xfff1;
uint64_t NMAX = 0x15B0;
__ mov(base, BASE);
__ mov(nmax, NMAX);
// Load accumulation coefficients for the upper 16 bits
__ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table));
__ ld1(vtable, __ T16B, Address(temp0));
// s1 is initialized to the lower 16 bits of adler // s2 is initialized to the upper 16 bits of adler
__ ubfx(s2, adler, 16, 16); // s2 = ((adler >> 16) & 0xffff)
__ uxth(s1, adler); // s1 = (adler & 0xffff)
// The pipelined loop needs at least 16 elements for 1 iteration // It does check this, but it is more effective to skip to the cleanup loop
__ cmp(len, (u1)16);
__ br(Assembler::HS, L_nmax);
__ cbz(len, L_combine);
BLOCK_COMMENT("Entry:");
__ enter(); // required for proper stackwalking of RuntimeStub frame
__ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
__ leave(); // required for proper stackwalking of RuntimeStub frame
__ ret(lr);
return start;
}
address generate_squareToLen() { // squareToLen algorithm for sizes 1..127 described in java code works // faster than multiply_to_len on some CPUs and slower on others, but // multiply_to_len shows a bit better overall results
__ align(CodeEntryAlignment);
StubCodeMark mark(this, "StubRoutines", "squareToLen");
address start = __ pc();
constRegister x = r0; constRegister xlen = r1; constRegister z = r2; constRegister zlen = r3; constRegister y = r4; // == x constRegister ylen = r5; // == xlen
__ cmp(len, (u1)15);
__ br(Assembler::GT, LEN_OVER_15); // The only case when execution falls into this code is when pointer is near // the end of memory page and we have to avoid reading next page
__ add(ary1, ary1, len);
__ subs(len, len, 8);
__ br(Assembler::GT, LEN_OVER_8);
__ ldr(rscratch2, Address(ary1, -8));
__ sub(rscratch1, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes.
__ lsrv(rscratch2, rscratch2, rscratch1);
__ tst(rscratch2, UPPER_BIT_MASK);
__ csel(result, zr, result, Assembler::NE);
__ leave();
__ ret(lr);
__ bind(LEN_OVER_8);
__ ldp(rscratch1, rscratch2, Address(ary1, -16));
__ sub(len, len, 8); // no data dep., then sub can be executed while loading
__ tst(rscratch2, UPPER_BIT_MASK);
__ br(Assembler::NE, RET_NO_POP);
__ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes
__ lsrv(rscratch1, rscratch1, rscratch2);
__ tst(rscratch1, UPPER_BIT_MASK);
__ bind(RET_NO_POP);
__ csel(result, zr, result, Assembler::NE);
__ leave();
__ ret(lr);
__ bind(ALIGNED);
__ cmp(len, large_loop_size);
__ br(Assembler::LT, CHECK_16); // Perform 16-byte load as early return in pre-loop to handle situation // when initially aligned large array has negative values at starting bytes, // so LARGE_LOOP would do 4 reads instead of 1 (in worst case), which is // slower. Cases with negative bytes further ahead won't be affected that // much. In fact, it'll be faster due to early loads, less instructions and // less branches in LARGE_LOOP.
__ ldp(tmp6, tmp1, Address(__ post(ary1, 16)));
__ sub(len, len, 16);
__ orr(tmp6, tmp6, tmp1);
__ tst(tmp6, UPPER_BIT_MASK);
__ br(Assembler::NE, RET_ADJUST_16);
__ cmp(len, large_loop_size);
__ br(Assembler::LT, CHECK_16);
if (SoftwarePrefetchHintDistance >= 0
&& SoftwarePrefetchHintDistance >= dcache_line) { // initial prefetch
__ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line));
}
__ bind(LARGE_LOOP); if (SoftwarePrefetchHintDistance >= 0) {
__ prfm(Address(ary1, SoftwarePrefetchHintDistance));
} // Issue load instructions first, since it can save few CPU/MEM cycles, also // instead of 4 triples of "orr(...), addr(...);cbnz(...);" (for each ldp) // better generate 7 * orr(...) + 1 andr(...) + 1 cbnz(...) which saves 3 // instructions per cycle and have less branches, but this approach disables // early return, thus, all 64 bytes are loaded and checked every time.
__ ldp(tmp2, tmp3, Address(ary1));
__ ldp(tmp4, tmp5, Address(ary1, 16));
__ ldp(rscratch1, rscratch2, Address(ary1, 32));
__ ldp(tmp6, tmp1, Address(ary1, 48));
__ add(ary1, ary1, large_loop_size);
__ sub(len, len, large_loop_size);
__ orr(tmp2, tmp2, tmp3);
__ orr(tmp4, tmp4, tmp5);
__ orr(rscratch1, rscratch1, rscratch2);
__ orr(tmp6, tmp6, tmp1);
__ orr(tmp2, tmp2, tmp4);
__ orr(rscratch1, rscratch1, tmp6);
__ orr(tmp2, tmp2, rscratch1);
__ tst(tmp2, UPPER_BIT_MASK);
__ br(Assembler::NE, RET_ADJUST_LONG);
__ cmp(len, large_loop_size);
__ br(Assembler::GE, LARGE_LOOP);
__ ldrq(vtmp, Address(__ post(tmp2, 16)));
__ ldr(tmpU, Address(__ post(cnt1, 8)));
__ zip1(vtmp3, __ T16B, vtmp, vtmpZ); // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3
int prefetchLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance/2);
__ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ); // cnt2 == amount of characters left to compare // Check already loaded first 4 symbols(vtmp and tmp2(LU)/tmp1(UL))
__ zip1(vtmp, __ T8B, vtmp, vtmpZ);
__ add(str1, str1, isLU ? wordSize/2 : wordSize);
__ add(str2, str2, isLU ? wordSize : wordSize/2);
__ fmovd(isLU ? tmp1 : tmp2, vtmp);
__ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case.
__ eor(rscratch2, tmp1, tmp2);
__ mov(rscratch1, tmp2);
__ cbnz(rscratch2, CALCULATE_DIFFERENCE); Register tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison
tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison
__ push(spilled_regs, sp);
__ mov(tmp2, isLU ? str1 : str2); // init the pointer to L next load
__ mov(cnt1, isLU ? str2 : str1); // init the pointer to U next load
__ ldr(tmp3, Address(__ post(cnt1, 8)));
if (SoftwarePrefetchHintDistance >= 0) {
__ subs(rscratch2, cnt2, prefetchLoopExitCondition);
__ br(__ LT, NO_PREFETCH);
__ bind(LARGE_LOOP_PREFETCH);
__ prfm(Address(tmp2, SoftwarePrefetchHintDistance));
__ mov(tmp4, 2);
__ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
__ bind(LARGE_LOOP_PREFETCH_REPEAT1);
compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
__ subs(tmp4, tmp4, 1);
__ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1);
__ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
__ mov(tmp4, 2);
__ bind(LARGE_LOOP_PREFETCH_REPEAT2);
compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
__ subs(tmp4, tmp4, 1);
__ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2);
__ sub(cnt2, cnt2, 64);
__ subs(rscratch2, cnt2, prefetchLoopExitCondition);
__ br(__ GE, LARGE_LOOP_PREFETCH);
}
__ cbz(cnt2, LOAD_LAST); // no characters left except last load
__ bind(NO_PREFETCH);
__ subs(cnt2, cnt2, 16);
__ br(__ LT, TAIL);
__ align(OptoLoopAlignment);
__ bind(SMALL_LOOP); // smaller loop
__ subs(cnt2, cnt2, 16);
compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
__ br(__ GE, SMALL_LOOP);
__ cmn(cnt2, (u1)16);
__ br(__ EQ, LOAD_LAST);
__ bind(TAIL); // 1..15 characters left until last load (last 4 characters)
__ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 32 bytes before last 4 characters in UTF-16 string
__ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string
__ ldr(tmp3, Address(cnt1, -8));
compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load
__ b(LOAD_LAST);
__ bind(DIFF2);
__ mov(tmpU, tmp3);
__ bind(DIFF1);
__ pop(spilled_regs, sp);
__ b(CALCULATE_DIFFERENCE);
__ bind(LOAD_LAST); // Last 4 UTF-16 characters are already pre-loaded into tmp3 by compare_string_16_x_LU. // No need to load it again
__ mov(tmpU, tmp3);
__ pop(spilled_regs, sp);
// tmp2 points to the address of the last 4 Latin1 characters right now
__ ldrs(vtmp, Address(tmp2));
__ zip1(vtmp, __ T8B, vtmp, vtmpZ);
__ fmovd(tmpL, vtmp);
if (bs_asm->nmethod_patching_type() == NMethodPatchingType::conc_instruction_and_data_patch) {
BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod(); // We can get here despite the nmethod being good, if we have not // yet applied our cross modification fence (or data fence).
Address thread_epoch_addr(rthread, in_bytes(bs_nm->thread_disarmed_offset()) + 4);
__ lea(rscratch2, ExternalAddress(bs_asm->patching_epoch_addr()));
__ ldrw(rscratch2, rscratch2);
__ strw(rscratch2, thread_epoch_addr);
__ isb();
__ membar(__ LoadLoad);
}
__ set_last_Java_frame(sp, rfp, lr, rscratch1);
__ enter();
__ add(rscratch2, sp, wordSize); // rscratch2 points to the saved lr
__ sub(sp, sp, 4 * wordSize); // four words for the returned {sp, fp, lr, pc}
// exit from large loop when less than 64 bytes left to read or we're about // to prefetch memory behind array border int largeLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2);
// before jumping to stub, pre-load 8 bytes already, so do comparison directly
__ eor(rscratch2, tmp1, tmp2);
__ cbnz(rscratch2, CAL_DIFFERENCE);
// Crop the vector to find its location.
__ sve_brkb(pgtmp2, pgtmp1, pgtmp2, false/* isMerge */); // Extract the first different characters of each string.
__ sve_lasta(rscratch1, mode == LL ? __ B : __ H, pgtmp2, ztmp1);
__ sve_lasta(rscratch2, mode == LL ? __ B : __ H, pgtmp2, ztmp2);
// Compute the difference of the first different characters.
__ sub(result, rscratch1, rscratch2);
// R0 = result // R1 = str2 // R2 = cnt1 // R3 = str1 // R4 = cnt2 // This generic linear code use few additional ideas, which makes it faster: // 1) we can safely keep at least 1st register of pattern(since length >= 8) // in order to skip initial loading(help in systems with 1 ld pipeline) // 2) we can use "fast" algorithm of finding single character to search for // first symbol with less branches(1 branch per each loaded register instead // of branch for each symbol), so, this is where constants like // 0x0101...01, 0x00010001...0001, 0x7f7f...7f, 0x7fff7fff...7fff comes from // 3) after loading and analyzing 1st register of source string, it can be // used to search for every 1st character entry, saving few loads in // comparison with "simplier-but-slower" implementation // 4) in order to avoid lots of push/pop operations, code below is heavily // re-using/re-initializing/compressing register values, which makes code // larger and a bit less readable, however, most of extra operations are // issued during loads or branches, so, penalty is minimal
address generate_string_indexof_linear(bool str1_isL, bool str2_isL) { constchar* stubName = str1_isL
? (str2_isL ? "indexof_linear_ll" : "indexof_linear_ul")
: "indexof_linear_uu";
__ align(CodeEntryAlignment);
StubCodeMark mark(this, "StubRoutines", stubName);
address entry = __ pc();
/**
 *  Arguments:
 *
 *  Input:
 *  c_rarg0   - current state address
 *  c_rarg1   - H key address
 *  c_rarg2   - data address
 *  c_rarg3   - number of blocks
 *
 *  Output:
 *  Updated state at c_rarg0
 */
address generate_ghash_processBlocks() { // Bafflingly, GCM uses little-endian for the byte order, but // big-endian for the bit order. For example, the polynomial 1 is // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00. // // So, we must either reverse the bytes in each word and do // everything big-endian or reverse the bits in each byte and do // it little-endian. On AArch64 it's more idiomatic to reverse // the bits in each byte (we have an instruction, RBIT, to do // that) and keep the data in little-endian bit order through the // calculation, bit-reversing the inputs and outputs.
// Emits the "ghash_processBlocks" stub and returns `start`.
// NOTE(review): `start`, `data` and `state` are referenced below, and v0, v1
// and v24 are consumed as state, subkey and polynomial, but their definitions
// and loads are not in the visible part of this function - confirm against
// the complete definition.
StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
// Align to 2 * wordSize before emitting the two-word constant, and keep its
// address in p.
__ align(wordSize * 2);
address p = __ pc();
__ emit_int64(0x87); // The low-order bits of the field // polynomial (i.e. p = z^7+z^2+z+1) // repeated in the low and high parts of a // 128-bit vector
__ emit_int64(0x87);
// The load uses post-indexed addressing: `data` is advanced by 0x10 afterwards.
__ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit // reversing each byte
__ rbit(v2, __ T16B, v2);
__ eor(v2, __ T16B, v0, v2); // bit-swapped data ^ bit-swapped state
// Multiply state in v2 by subkey in v1
// Per the argument labels: product low half in v5, high half in v7;
// v4 is passed as a1_xor_a0, v6 and v3 as temps.
__ ghash_multiply(/*result_lo*/v5, /*result_hi*/v7, /*a*/v1, /*b*/v2, /*a1_xor_a0*/v4, /*temps*/v6, v3, /*reuse/clobber b*/v2); // Reduce v7:v5 by the field polynomial
__ ghash_reduce(/*result*/v0, /*lo*/v5, /*hi*/v7, /*p*/v24, vzr, /*temp*/v3);
// The bit-reversed result is at this point in v0
// Apply rev64 and then rbit to v0 before storing it through `state`.
__ rev64(v0, __ T16B, v0);
__ rbit(v0, __ T16B, v0);
__ st1(v0, __ T16B, state);
__ ret(lr);
// Hand the stub's entry address back to the caller of the generator.
return start;
}
address generate_ghash_processBlocks_wide() {
address small = generate_ghash_processBlocks();
StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks_wide");
__ align(wordSize * 2);
address p = __ pc();
__ emit_int64(0x87); // The low-order bits of the field // polynomial (i.e. p = z^7+z^2+z+1) // repeated in the low and high parts of a // 128-bit vector
__ emit_int64(0x87);
// we need unsigned saturating subtract, to make sure all input values // in range [0, 63] will have 0U value in the higher half lookup
__ uqsubv(decH0, __ T16B, in0, v27);
__ uqsubv(decH1, __ T16B, in1, v27);
__ uqsubv(decH2, __ T16B, in2, v27);
__ uqsubv(decH3, __ T16B, in3, v27);
// A legal value of base64 code is in range [0, 127]. We need two lookups // with tbl/tbx and combine them to get the decode data. The 1st table vector // lookup use tbl, out of range indices are set to 0 in destination. The 2nd // table vector lookup use tbx, out of range indices are unchanged in // destination. Input [64..126] is mapped to index [65, 127] in second lookup. // The value of index 64 is set to 0, so that we know that we already get the // decoded data with the 1st lookup. staticconst uint8_t fromBase64ForSIMD[128] = {
255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 255u, 63u,
52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u,
0u, 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u,
14u, 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u,
255u, 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u,
40u, 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u,
};
// In the MIME case, the line length cannot be more than 76 // bytes (see RFC 2045). This is too short a block for SIMD // to be worthwhile, so we use non-SIMD here.
__ movw(rscratch1, 79);
// ARMv8.1 LSE versions of the atomic stubs used by Atomic::PlatformXX. // // If LSE is in use, generate LSE versions of all the stubs. The // non-LSE versions are in atomic_aarch64.S.
// class AtomicStubMark records the entry point of a stub and the // stub pointer which will point to it. The stub pointer is set to // the entry point when ~AtomicStubMark() is called, which must be // after ICache::invalidate_range. This ensures safe publication of // the generated code. class AtomicStubMark {
address _entry_point;
aarch64_atomic_stub_t *_stub;
MacroAssembler *_masm; public:
AtomicStubMark(MacroAssembler *masm, aarch64_atomic_stub_t *stub) {
_masm = masm;
__ align(32);
_entry_point = __ pc();
_stub = stub;
}
~AtomicStubMark() {
*_stub = (aarch64_atomic_stub_t)_entry_point;
}
};
// NB: For memory_order_conservative we need a trailing membar after // LSE atomic operations but not a leading membar. // // We don't need a leading membar because a clause in the Arm ARM // says: // // Barrier-ordered-before // // Barrier instructions order prior Memory effects before subsequent // Memory effects generated by the same Observer. A read or a write // RW1 is Barrier-ordered-before a read or a write RW 2 from the same // Observer if and only if RW1 appears in program order before RW 2 // and [ ... ] at least one of RW 1 and RW 2 is generated by an atomic // instruction with both Acquire and Release semantics. // // All the atomic instructions {ldaddal, swapal, casal} have Acquire // and Release semantics, therefore we don't need a leading // barrier. However, there is no corresponding Barrier-ordered-after // relationship, therefore we need a trailing membar to prevent a // later store or load from being reordered with the store in an // atomic instruction. // // This was checked by using the herd7 consistency model simulator // (http://diy.inria.fr/) with this test case: // // AArch64 LseCas // { 0:X1=x; 0:X2=y; 1:X1=x; 1:X2=y; } // P0 | P1; // LDR W4, [X2] | MOV W3, #0; // DMB LD | MOV W4, #1; // LDR W3, [X1] | CASAL W3, W4, [X1]; // | DMB ISH; // | STR W4, [X2]; // exists // (0:X3=0 /\ 0:X4=1) // // If X3 == 0 && X4 == 1, the store to y in P1 has been reordered // with the store to x in P1. Without the DMB in P1 this may happen. // // At the time of writing we don't know of any AArch64 hardware that // reorders stores in this way, but the Reference Manual permits it.
if (return_barrier) { // preserve possible return value from a method returning to the return barrier
__ fmovd(rscratch1, v0);
__ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize)));
}
__ movw(c_rarg1, (return_barrier ? 1 : 0));
__ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::prepare_thaw), rthread, c_rarg1);
__ mov(rscratch2, r0); // r0 contains the size of the frames to thaw, 0 if overflow or no more frames
if (return_barrier) { // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
__ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize)));
__ fmovd(v0, rscratch1);
}
assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp");
Label thaw_success; // rscratch2 contains the size of the frames to thaw, 0 if overflow or no more frames
__ cbnz(rscratch2, thaw_success);
__ lea(rscratch1, ExternalAddress(StubRoutines::throw_StackOverflowError_entry()));
__ br(rscratch1);
__ bind(thaw_success);
// make room for the thawed frames
__ sub(rscratch1, sp, rscratch2);
__ andr(rscratch1, rscratch1, -16); // align
__ mov(sp, rscratch1);
if (return_barrier) { // save original return value -- again
__ fmovd(rscratch1, v0);
__ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize)));
}
// If we want, we can templatize thaw by kind, and have three different entries
__ movw(c_rarg1, (uint32_t)kind);
__ call_VM_leaf(Continuation::thaw_entry(), rthread, c_rarg1);
__ mov(rscratch2, r0); // r0 is the sp of the yielding frame
if (return_barrier) { // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
__ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize)));
__ fmovd(v0, rscratch1);
} else {
__ mov(r0, zr); // return 0 (success) from doYield
}
// we're now on the yield frame (which is in an address above us b/c rsp has been pushed down)
__ sub(sp, rscratch2, 2*wordSize); // now pointing to rfp spill
__ mov(rfp, sp);
if (return_barrier_exception) {
__ ldr(c_rarg1, Address(rfp, wordSize)); // return address
__ verify_oop(r0);
__ mov(r19, r0); // save return value contaning the exception oop in callee-saved R19
// Reinitialize the ptrue predicate register, in case the external runtime call clobbers ptrue reg, as we may return to SVE compiled code. // __ reinitialize_ptrue();
// see OptoRuntime::generate_exception_blob: r0 -- exception oop, r3 -- exception pc
__ mov(r1, r0); // the exception handler
__ mov(r0, r19); // restore return value contaning the exception oop
__ verify_oop(r0);
__ leave();
__ mov(r3, lr);
__ br(r1); // the exception handler
} else { // We're "returning" into the topmost thawed frame; see Thaw::push_return_frame
__ leave();
__ ret(lr);
}
return start;
}
address generate_cont_thaw() { if (!Continuations::enabled()) return nullptr;
// For c2: c_rarg0 is junk, call to runtime to write a checkpoint.
// It returns a jobject handle to the event writer.
// The handle is dereferenced and the return value is the event writer oop.
//
// Builds the code in a private CodeBuffer named "jfr_write_checkpoint" and
// wraps it in a RuntimeStub, which is returned.
//
// NOTE(review): `frame_complete` is used below but its definition (and the
// code that emits the stub's instructions) is not visible in this excerpt;
// it has been left untouched rather than reconstructed.
static RuntimeStub* generate_jfr_write_checkpoint() {
  enum layout {
    rbp_off,
    rbpH_off,
    return_off,
    return_off2,
    framesize // inclusive of return address
  };

  int insts_size = 512;
  int locs_size = 64;
  CodeBuffer code("jfr_write_checkpoint", insts_size, locs_size);
  OopMapSet* oop_maps = new OopMapSet();
  MacroAssembler* masm = new MacroAssembler(&code);
  MacroAssembler* _masm = masm;

  RuntimeStub* stub = // codeBlob framesize is in words (not VMRegImpl::slot_size)
    RuntimeStub::new_runtime_stub("jfr_write_checkpoint", &code, frame_complete,
                                  (framesize >> (LogBytesPerWord - LogBytesPerInt)),
                                  oop_maps, false);
  return stub;
}
#endif// INCLUDE_JFR
// Continuation point for throwing of implicit exceptions that are // not handled in the current activation. Fabricates an exception // oop and initiates normal exception dispatching in this // frame. Since we need to preserve callee-saved values (currently // only for C2, but done for C1 as well) we need a callee-saved oop // map and therefore have to make these stubs into RuntimeStubs // rather than BufferBlobs. If the compiler needs all registers to // be preserved between the fault point and the exception handler // then it must assume responsibility for that in // AbstractCompiler::continuation_for_implicit_null_exception or // continuation_for_implicit_division_by_zero_exception. All other // implicit exceptions (e.g., NullPointerException or // AbstractMethodError on entry) are either at call sites or // otherwise assume that stack unwinding will be initiated, so // caller saved registers were assumed volatile in the compiler.
#undef __ #define __ masm->
address generate_throw_exception(constchar* name,
address runtime_entry, Register arg1 = noreg, Register arg2 = noreg) { // Information about frame layout at time of blocking runtime call. // Note that we only have to preserve callee-saved registers since // the compilers are responsible for supplying a continuation point // if they expect all registers to be preserved. // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0 enum layout {
rfp_off = 0,
rfp_off2,
return_off,
return_off2,
framesize // inclusive of return address
};
int insts_size = 512; int locs_size = 64;
CodeBuffer code(name, insts_size, locs_size);
OopMapSet* oop_maps = new OopMapSet();
MacroAssembler* masm = new MacroAssembler(&code);
address start = __ pc();
// This is an inlined and slightly modified version of call_VM // which has the ability to fetch the return PC out of // thread-local storage and also sets up last_Java_sp slightly // differently than the real call_VM
__ enter(); // Save FP and LR before call
assert(is_even(framesize/2), "sp not 16-byte aligned");
// lr and fp are already in place
__ sub(sp, rfp, ((uint64_t)framesize-4) << LogBytesPerInt); // prolog
int frame_complete = __ pc() - start;
// Set up last_Java_sp and last_Java_fp
address the_pc = __ pc();
__ set_last_Java_frame(sp, rfp, the_pc, rscratch1);
// Generate oop map
OopMap* map = new OopMap(framesize, 0);
oop_maps->add_gc_map(the_pc - start, map);
__ reset_last_Java_frame(true);
// Reinitialize the ptrue predicate register, in case the external runtime // call clobbers ptrue reg, as we may return to SVE compiled code.
__ reinitialize_ptrue();
// Zero the m*n result.
mov(Rhi_mn, zr);
mov(Rlo_mn, zr);
}
// The core multiply-accumulate step of a Montgomery // multiplication. The idea is to schedule operations as a // pipeline so that instructions with long latencies (loads and // multiplies) have time to complete before their results are // used. This most benefits in-order implementations of the // architecture but out-of-order ones also benefit. void step() {
block_comment("step"); // MACC(Ra, Rb, t0, t1, t2); // Ra = *++Pa; // Rb = *--Pb;
umulh(Rhi_ab, Ra, Rb);
mul(Rlo_ab, Ra, Rb);
ldr(Ra, pre(Pa, wordSize));
ldr(Rb, pre(Pb, -wordSize));
acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the // previous iteration. // MACC(Rm, Rn, t0, t1, t2); // Rm = *++Pm; // Rn = *--Pn;
umulh(Rhi_mn, Rm, Rn);
mul(Rlo_mn, Rm, Rn);
ldr(Rm, pre(Pm, wordSize));
ldr(Rn, pre(Pn, -wordSize));
acc(Rhi_ab, Rlo_ab, t0, t1, t2);
}
#ifndef PRODUCT // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
{
mul(Rlo_mn, Rm, Rn);
add(Rlo_mn, t0, Rlo_mn);
Label ok;
cbz(Rlo_mn, ok); {
stop("broken Montgomery multiply");
} bind(ok);
} #endif // We have very carefully set things up so that // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate // the lower half of Rm * Rn because we know the result already: // it must be -t0. t0 + (-t0) must generate a carry iff // t0 != 0. So, rather than do a mul and an adds we just set // the carry flag iff t0 is nonzero. // // mul(Rlo_mn, Rm, Rn); // adds(zr, t0, Rlo_mn);
subs(zr, t0, 1); // Set carry iff t0 is nonzero
adcs(t0, t1, Rhi_mn);
adc(t1, t2, zr);
mov(t2, zr);
}
// Emits the "post2" step: computes Rj = i - len, folds the pending low m*n
// word into t0, stores t0 to Pm_base[Rj], then shifts the accumulator down
// by one word while adding the pending high m*n word.
//
//   i   - index, either a constant or a register
//   len - length, either a constant or a register
//
// NOTE(review): the constant branch calls len.as_constant() without checking
// len.is_constant() -- presumably callers pass i and len as constants
// together; confirm against the call sites.
void post2(RegisterOrConstant i, RegisterOrConstant len) {
block_comment("post2"); if (i.is_constant()) {
// Both values known at generation time: materialize the difference directly.
mov(Rj, i.as_constant()-len.as_constant());
} else {
sub(Rj, i.as_register(), len);
}
adds(t0, t0, Rlo_mn); // The pending m*n, low part
// As soon as we know the least significant digit of our result,
// store it.
// Pm_base[i-len] = t0;
str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
// t0 = t1; t1 = t2; t2 = 0;
adcs(t0, t1, Rhi_mn); // The pending m*n, high part
adc(t1, t2, zr);
mov(t2, zr);
}
// A carry in t0 after Montgomery multiplication means that we // should subtract multiples of n from our result in m. We'll // keep doing that until there is no carry. void normalize(RegisterOrConstant len) {
block_comment("normalize"); // while (t0) // t0 = sub(Pm_base, Pn_base, t0, len);
Label loop, post, again; Register cnt = t1, i = t2; // Re-use registers; we're done with them now
cbz(t0, post); {
bind(again); {
mov(i, zr);
mov(cnt, len);
ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
subs(zr, zr, zr); // set carry flag, i.e. no borrow
align(16);
bind(loop); {
sbcs(Rm, Rm, Rn);
str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
add(i, i, 1);
ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
sub(cnt, cnt, 1);
} cbnz(cnt, loop);
sbc(t0, t0, zr);
} cbnz(t0, again);
} bind(post);
}
// Move memory at s to d, reversing words. // Increments d to end of copied memory // Destroys tmp1, tmp2 // Preserves len // Leaves s pointing to the address which was in d at start void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
assert(tmp1->encoding() < r19->encoding(), "register corruption");
assert(tmp2->encoding() < r19->encoding(), "register corruption");
#ifndef PRODUCT // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
{
mul(Rlo_mn, Rm, Rn);
add(Rlo_mn, t0, Rlo_mn);
Label ok;
cbz(Rlo_mn, ok); {
stop("broken Montgomery multiply");
} bind(ok);
} #endif // We have very carefully set things up so that // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate // the lower half of Rm * Rn because we know the result already: // it must be -t0. t0 + (-t0) must generate a carry iff // t0 != 0. So, rather than do a mul and an adds we just set // the carry flag iff t0 is nonzero. // // mul(Rlo_mn, Rm, Rn); // adds(zr, t0, Rlo_mn);
subs(zr, t0, 1); // Set carry iff t0 is nonzero
adcs(t0, t1, Rhi_mn);
adc(t1, t2, zr);
mov(t2, zr);
}
public: /** * Fast Montgomery multiplication. The derivation of the * algorithm is in A Cryptographic Library for the Motorola * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237. * * Arguments: * * Inputs for multiplication: * c_rarg0 - int array elements a * c_rarg1 - int array elements b * c_rarg2 - int array elements n (the modulus) * c_rarg3 - int length * c_rarg4 - int inv * c_rarg5 - int array elements m (the result) * * Inputs for squaring: * c_rarg0 - int array elements a * c_rarg1 - int array elements n (the modulus) * c_rarg2 - int length * c_rarg3 - int inv * c_rarg4 - int array elements m (the result) *
*/
address generate_multiply() {
Label argh, nothing;
bind(argh);
stop("MontgomeryMultiply total_allocation must be <= 8192");
lsrw(Rlen, Rlen, 1); // length in longwords = len/2
{ // Copy input args, reversing as we go. We use Ra as a // temporary variable.
reverse(Ra, Pa_base, Rlen, t0, t1); if (!_squaring)
reverse(Ra, Pb_base, Rlen, t0, t1);
reverse(Ra, Pn_base, Rlen, t0, t1);
}
// Push all call-saved registers and also Pm_base which we'll need // at the end.
save_regs();
/** * Fast Montgomery squaring. This uses asymptotically 25% fewer * multiplies than Montgomery multiplication so it should be up to * 25% faster. However, its loop control is more complex and it * may actually run slower on some machines. * * Arguments: * * Inputs: * c_rarg0 - int array elements a * c_rarg1 - int array elements n (the modulus) * c_rarg2 - int length * c_rarg3 - int inv * c_rarg4 - int array elements m (the result) *
*/
address generate_square() {
Label argh;
bind(argh);
stop("MontgomeryMultiply total_allocation must be <= 8192");
lsrw(Rlen, Rlen, 1); // length in longwords = len/2
{ // Copy input args, reversing as we go. We use Ra as a // temporary variable.
reverse(Ra, Pa_base, Rlen, t0, t1);
reverse(Ra, Pn_base, Rlen, t0, t1);
}
// Push all call-saved registers and also Pm_base which we'll need // at the end.
save_regs();
// Initialization void generate_initial() { // Generate initial stubs and initializes the entry points
// entry points that exist in all platforms Note: This is code // that could be shared among different platforms - however the // benefit seems to be smaller than the disadvantage of having a // much more complicated generator structure. See also comment in // stubRoutines.hpp.
// is referenced by megamorphic call
StubRoutines::_catch_exception_entry = generate_catch_exception();
// Build this early so it's available for the interpreter.
StubRoutines::_throw_StackOverflowError_entry =
generate_throw_exception("StackOverflowError throw_exception",
CAST_FROM_FN_PTR(address,
SharedRuntime::throw_StackOverflowError));
StubRoutines::_throw_delayed_StackOverflowError_entry =
generate_throw_exception("delayed StackOverflowError throw_exception",
CAST_FROM_FN_PTR(address,
SharedRuntime::throw_delayed_StackOverflowError)); if (UseCRC32Intrinsics) { // set table address before stub generation which use it
StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
}
if (UseCRC32CIntrinsics) {
StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
}
// Disabled until JDK-8210858 is fixed // if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog)) { // StubRoutines::_dlog = generate_dlog(); // }
StubRoutines::_throw_NullPointerException_at_call_entry =
generate_throw_exception("NullPointerException at call throw_exception",
CAST_FROM_FN_PTR(address,
SharedRuntime::
throw_NullPointerException_at_call));
if (UseSVE == 0) {
StubRoutines::aarch64::_vector_iota_indices = generate_iota_indices("iota_indices");
}
// arraycopy stubs used by compilers
generate_arraycopy_stubs();
// countPositives stub for large arrays.
StubRoutines::aarch64::_count_positives = generate_count_positives(StubRoutines::aarch64::_count_positives_long);
// array equals stub for large arrays. if (!UseSimpleArrayEquals) {
StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
}
generate_compare_long_strings();
generate_string_indexof_stubs();
// byte_array_inflate stub for large arrays.
StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate();
if (UseMontgomerySquareIntrinsic) {
StubCodeMark mark(this, "StubRoutines", "montgomerySquare");
MontgomeryMultiplyGenerator g(_masm, /*squaring*/true); // We use generate_multiply() rather than generate_square() // because it's faster for the sizes of modulus we care about.
StubRoutines::_montgomerySquare = g.generate_multiply();
} #endif// COMPILER2
if (UseChaCha20Intrinsics) {
StubRoutines::_chacha20Block = generate_chacha20Block_blockpar();
}
if (UseBASE64Intrinsics) {
StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock();
}
// data cache line writeback
StubRoutines::_data_cache_writeback = generate_data_cache_writeback();
StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync();
public:
// Constructs the generator over `code` and immediately runs the generation
// pass selected by `phase`:
//   0         -> generate_initial()
//   1         -> generate_phase1()
//   otherwise -> generate_all()
StubGenerator(CodeBuffer* code, int phase) : StubCodeGenerator(code) {
  if (phase == 0) {
    generate_initial();
  } else if (phase == 1) {
    generate_phase1(); // stubs that must be available for the interpreter
  } else {
    generate_all();
  }
}
}; // end class declaration
#define UCM_TABLE_MAX_ENTRIES 8 void StubGenerator_generate(CodeBuffer* code, int phase) { if (UnsafeCopyMemory::_table == NULL) {
UnsafeCopyMemory::create_table(UCM_TABLE_MAX_ENTRIES);
}
StubGenerator g(code, phase);
}
#ifdefined (LINUX)
// Define pointers to atomic stubs and initialize them to point to the // code in atomic_aarch64.S.
¤ Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereitgestellten Informationen zugesichert. 0.205 Bemerkung:
(vorverarbeitet am 2026-04-26)
¤
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.