Quelle stubGenerator_aarch64.cpp Sprache: C

/*
* Copyright (c) 2003, 2022, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2014, 2022, Red Hat Inc. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*
*/

#include "precompiled.hpp"
#include "asm/macroAssembler.hpp"
#include "asm/macroAssembler.inline.hpp"
#include "asm/register.hpp"
#include "atomic_aarch64.hpp"
#include "compiler/oopMap.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "gc/shared/gc_globals.hpp"
#include "gc/shared/tlab_globals.hpp"
#include "interpreter/interpreter.hpp"
#include "memory/universe.hpp"
#include "nativeInst_aarch64.hpp"
#include "oops/instanceOop.hpp"
#include "oops/method.hpp"
#include "oops/objArrayKlass.hpp"
#include "oops/oop.inline.hpp"
#include "prims/methodHandles.hpp"
#include "runtime/atomic.hpp"
#include "runtime/continuation.hpp"
#include "runtime/continuationEntry.inline.hpp"
#include "runtime/frame.inline.hpp"
#include "runtime/handles.inline.hpp"
#include "runtime/javaThread.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/stubCodeGenerator.hpp"
#include "runtime/stubRoutines.hpp"
#include "utilities/align.hpp"
#include "utilities/globalDefinitions.hpp"
#include "utilities/powerOfTwo.hpp"
#ifdef COMPILER2
#include "opto/runtime.hpp"
#endif
#if INCLUDE_ZGC
#include "gc/z/zThreadLocalData.hpp"
#endif

// Declaration and definition of StubGenerator (no .hpp file).
// For a more detailed description of the stub routine structure
// see the comment in stubRoutines.hpp

#undef __
#define __ _masm->

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#else
#define BLOCK_COMMENT(str) __ block_comment(str)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

// Stub Code definitions

class StubGenerator: public StubCodeGenerator {
private:

#ifdef PRODUCT
#define inc_counter_np(counter) ((void)0)
#else
  void inc_counter_np_(int& counter) {
    __ lea(rscratch2, ExternalAddress((address)&counter));
    __ ldrw(rscratch1, Address(rscratch2));
    __ addw(rscratch1, rscratch1, 1);
    __ strw(rscratch1, Address(rscratch2));
  }
#define inc_counter_np(counter) \
  BLOCK_COMMENT("inc_counter " #counter); \
  inc_counter_np_(counter);
#endif

  // Call stubs are used to call Java from C
  //
  // Arguments:
  //    c_rarg0:   call wrapper address                   address
  //    c_rarg1:   result                                 address
  //    c_rarg2:   result type                            BasicType
  //    c_rarg3:   method                                 Method*
  //    c_rarg4:   (interpreter) entry point              address
  //    c_rarg5:   parameters                             intptr_t*
  //    c_rarg6:   parameter size (in words)              int
  //    c_rarg7:   thread                                 Thread*
  //
  // There is no return from the stub itself as any Java result
  // is written to result
  //
  // we save r30 (lr) as the return PC at the base of the frame and
  // link r29 (fp) below it as the frame pointer installing sp (r31)
  // into fp.
  //
  // we save r0-r7, which accounts for all the c arguments.
  //
  // TODO: strictly do we need to save them all? they are treated as
  // volatile by C so could we omit saving the ones we are going to
  // place in global registers (thread? method?) or those we only use
  // during setup of the Java call?
  //
  // we don't need to save r8 which C uses as an indirect result location
  // return register.
  //
  // we don't need to save r9-r15 which both C and Java treat as
  // volatile
  //
  // we don't need to save r16-18 because Java does not use them
  //
  // we save r19-r28 which Java uses as scratch registers and C
  // expects to be callee-save
  //
  // we save the bottom 64 bits of each value stored in v8-v15; it is
  // the responsibility of the caller to preserve larger values.
  //
  // so the stub frame looks like this when we enter Java code
  //
  //     [ return_from_Java     ] <--- sp
  //     [ argument word n      ]
  //      ...
  // -27 [ argument word 1      ]
  // -26 [ saved v15            ] <--- sp_after_call
  // -25 [ saved v14            ]
  // -24 [ saved v13            ]
  // -23 [ saved v12            ]
  // -22 [ saved v11            ]
  // -21 [ saved v10            ]
  // -20 [ saved v9             ]
  // -19 [ saved v8             ]
  // -18 [ saved r28            ]
  // -17 [ saved r27            ]
  // -16 [ saved r26            ]
  // -15 [ saved r25            ]
  // -14 [ saved r24            ]
  // -13 [ saved r23            ]
  // -12 [ saved r22            ]
  // -11 [ saved r21            ]
  // -10 [ saved r20            ]
  //  -9 [ saved r19            ]
  //  -8 [ call wrapper    (r0) ]
  //  -7 [ result          (r1) ]
  //  -6 [ result type     (r2) ]
  //  -5 [ method          (r3) ]
  //  -4 [ entry point     (r4) ]
  //  -3 [ parameters      (r5) ]
  //  -2 [ parameter size  (r6) ]
  //  -1 [ thread (r7)          ]
  //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
  //   1 [ saved lr       (r30) ]

  // Call stub stack layout word offsets from fp
  enum call_stub_layout {
    sp_after_call_off = -26,

    d15_off            = -26,
    d13_off            = -24,
    d11_off            = -22,
    d9_off             = -20,

    r28_off            = -18,
    r26_off            = -16,
    r24_off            = -14,
    r22_off            = -12,
    r20_off            = -10,
    call_wrapper_off   =  -8,
    result_off         =  -7,
    result_type_off    =  -6,
    method_off         =  -5,
    entry_point_off    =  -4,
    parameter_size_off =  -2,
    thread_off         =  -1,
    fp_f               =   0,
    retaddr_off        =   1,
  };

  address generate_call_stub(address& return_address) {
    assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
           (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
           "adjust this code");

    StubCodeMark mark(this, "StubRoutines", "call_stub");
    address start = __ pc();

    const Address sp_after_call(rfp, sp_after_call_off * wordSize);

    const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
    const Address result        (rfp, result_off         * wordSize);
    const Address result_type   (rfp, result_type_off    * wordSize);
    const Address method        (rfp, method_off         * wordSize);
    const Address entry_point   (rfp, entry_point_off    * wordSize);
    const Address parameter_size(rfp, parameter_size_off * wordSize);

    const Address thread        (rfp, thread_off         * wordSize);

    const Address d15_save      (rfp, d15_off * wordSize);
    const Address d13_save      (rfp, d13_off * wordSize);
    const Address d11_save      (rfp, d11_off * wordSize);
    const Address d9_save       (rfp, d9_off * wordSize);

    const Address r28_save      (rfp, r28_off * wordSize);
    const Address r26_save      (rfp, r26_off * wordSize);
    const Address r24_save      (rfp, r24_off * wordSize);
    const Address r22_save      (rfp, r22_off * wordSize);
    const Address r20_save      (rfp, r20_off * wordSize);

    // stub code

    address aarch64_entry = __ pc();

    // set up frame and move sp to end of save area
    __ enter();
    __ sub(sp, rfp, -sp_after_call_off * wordSize);

    // save register parameters and Java scratch/global registers
    // n.b. we save thread even though it gets installed in
    // rthread because we want to sanity check rthread later
    __ str(c_rarg7,  thread);
    __ strw(c_rarg6, parameter_size);
    __ stp(c_rarg4, c_rarg5,  entry_point);
    __ stp(c_rarg2, c_rarg3,  result_type);
    __ stp(c_rarg0, c_rarg1,  call_wrapper);

    __ stp(r20, r19,   r20_save);
    __ stp(r22, r21,   r22_save);
    __ stp(r24, r23,   r24_save);
    __ stp(r26, r25,   r26_save);
    __ stp(r28, r27,   r28_save);

    __ stpd(v9,  v8,   d9_save);
    __ stpd(v11, v10,  d11_save);
    __ stpd(v13, v12,  d13_save);
    __ stpd(v15, v14,  d15_save);

    // install Java thread in global register now we have saved
    // whatever value it held
    __ mov(rthread, c_rarg7);
    // And method
    __ mov(rmethod, c_rarg3);

    // set up the heapbase register
    __ reinit_heapbase();

#ifdef ASSERT
    // make sure we have no pending exceptions
    {
      Label L;
      __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
      __ cmp(rscratch1, (u1)NULL_WORD);
      __ br(Assembler::EQ, L);
      __ stop("StubRoutines::call_stub: entered with pending exception");
      __ BIND(L);
    }
#endif
    // pass parameters if any
    __ mov(esp, sp);
    __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
    __ andr(sp, rscratch1, -2 * wordSize);

    BLOCK_COMMENT("pass parameters if any");
    Label parameters_done;
    // parameter count is still in c_rarg6
    // and parameter pointer identifying param 1 is in c_rarg5
    __ cbzw(c_rarg6, parameters_done);

    address loop = __ pc();
    __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
    __ subsw(c_rarg6, c_rarg6, 1);
    __ push(rscratch1);
    __ br(Assembler::GT, loop);

    __ BIND(parameters_done);

    // call Java entry -- passing methdoOop, and current sp
    //      rmethod: Method*
    //      r19_sender_sp: sender sp
    BLOCK_COMMENT("call Java function");
    __ mov(r19_sender_sp, sp);
    __ blr(c_rarg4);

    // we do this here because the notify will already have been done
    // if we get to the next instruction via an exception
    //
    // n.b. adding this instruction here affects the calculation of
    // whether or not a routine returns to the call stub (used when
    // doing stack walks) since the normal test is to check the return
    // pc against the address saved below. so we may need to allow for
    // this extra instruction in the check.

    // save current address for use by exception handling code

    return_address = __ pc();

    // store result depending on type (everything that is not
    // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
    // n.b. this assumes Java returns an integral result in r0
    // and a floating result in j_farg0
    __ ldr(j_rarg2, result);
    Label is_long, is_float, is_double, exit;
    __ ldr(j_rarg1, result_type);
    __ cmp(j_rarg1, (u1)T_OBJECT);
    __ br(Assembler::EQ, is_long);
    __ cmp(j_rarg1, (u1)T_LONG);
    __ br(Assembler::EQ, is_long);
    __ cmp(j_rarg1, (u1)T_FLOAT);
    __ br(Assembler::EQ, is_float);
    __ cmp(j_rarg1, (u1)T_DOUBLE);
    __ br(Assembler::EQ, is_double);

    // handle T_INT case
    __ strw(r0, Address(j_rarg2));

    __ BIND(exit);

    // pop parameters
    __ sub(esp, rfp, -sp_after_call_off * wordSize);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L, S;
      __ ldr(rscratch1, thread);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::NE, S);
      __ get_thread(rscratch1);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::EQ, L);
      __ BIND(S);
      __ stop("StubRoutines::call_stub: threads must correspond");
      __ BIND(L);
    }
#endif

    __ pop_cont_fastpath(rthread);

    // restore callee-save registers
    __ ldpd(v15, v14,  d15_save);
    __ ldpd(v13, v12,  d13_save);
    __ ldpd(v11, v10,  d11_save);
    __ ldpd(v9,  v8,   d9_save);

    __ ldp(r28, r27,   r28_save);
    __ ldp(r26, r25,   r26_save);
    __ ldp(r24, r23,   r24_save);
    __ ldp(r22, r21,   r22_save);
    __ ldp(r20, r19,   r20_save);

    __ ldp(c_rarg0, c_rarg1,  call_wrapper);
    __ ldrw(c_rarg2, result_type);
    __ ldr(c_rarg3,  method);
    __ ldp(c_rarg4, c_rarg5,  entry_point);
    __ ldp(c_rarg6, c_rarg7,  parameter_size);

    // leave frame and return to caller
    __ leave();
    __ ret(lr);

    // handle return types different from T_INT

    __ BIND(is_long);
    __ str(r0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    __ BIND(is_float);
    __ strs(j_farg0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    __ BIND(is_double);
    __ strd(j_farg0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    return start;
  }

  // Return point for a Java call if there's an exception thrown in
  // Java code.  The exception is caught and transformed into a
  // pending exception stored in JavaThread that can be tested from
  // within the VM.
  //
  // Note: Usually the parameters are removed by the callee. In case
  // of an exception crossing an activation frame boundary, that is
  // not the case if the callee is compiled code => need to setup the
  // rsp.
  //
  // r0: exception oop

  address generate_catch_exception() {
    StubCodeMark mark(this, "StubRoutines", "catch_exception");
    address start = __ pc();

    // same as in generate_call_stub():
    const Address sp_after_call(rfp, sp_after_call_off * wordSize);
    const Address thread        (rfp, thread_off         * wordSize);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L, S;
      __ ldr(rscratch1, thread);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::NE, S);
      __ get_thread(rscratch1);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::EQ, L);
      __ bind(S);
      __ stop("StubRoutines::catch_exception: threads must correspond");
      __ bind(L);
    }
#endif

    // set pending exception
    __ verify_oop(r0);

    __ str(r0, Address(rthread, Thread::pending_exception_offset()));
    __ mov(rscratch1, (address)__FILE__);
    __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
    __ movw(rscratch1, (int)__LINE__);
    __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));

    // complete return to VM
    assert(StubRoutines::_call_stub_return_address != NULL,
           "_call_stub_return_address must have been generated before");
    __ b(StubRoutines::_call_stub_return_address);

    return start;
  }

  // Continuation point for runtime calls returning with a pending
  // exception.  The pending exception check happened in the runtime
  // or native call stub.  The pending exception in Thread is
  // converted into a Java-level exception.
  //
  // Contract with Java-level exception handlers:
  // r0: exception
  // r3: throwing pc
  //
  // NOTE: At entry of this stub, exception-pc must be in LR !!

  // NOTE: this is always used as a jump target within generated code
  // so it just needs to be generated code with no x86 prolog

  address generate_forward_exception() {
    StubCodeMark mark(this, "StubRoutines", "forward exception");
    address start = __ pc();

    // Upon entry, LR points to the return address returning into
    // Java (interpreted or compiled) code; i.e., the return address
    // becomes the throwing pc.
    //
    // Arguments pushed before the runtime call are still on the stack
    // but the exception handler will reset the stack pointer ->
    // ignore them.  A potential result in registers can be ignored as
    // well.

#ifdef ASSERT
    // make sure this code is only executed if there is a pending exception
    {
      Label L;
      __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
      __ cbnz(rscratch1, L);
      __ stop("StubRoutines::forward exception: no pending exception (1)");
      __ bind(L);
    }
#endif

    // compute exception handler into r19

    // call the VM to find the handler address associated with the
    // caller address. pass thread in r0 and caller pc (ret address)
    // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
    // the stack.
    __ mov(c_rarg1, lr);
    // lr will be trashed by the VM call so we move it to R19
    // (callee-saved) because we also need to pass it to the handler
    // returned by this call.
    __ mov(r19, lr);
    BLOCK_COMMENT("call exception_handler_for_return_address");
    __ call_VM_leaf(CAST_FROM_FN_PTR(address,
                         SharedRuntime::exception_handler_for_return_address),
                    rthread, c_rarg1);
    // Reinitialize the ptrue predicate register, in case the external runtime
    // call clobbers ptrue reg, as we may return to SVE compiled code.
    __ reinitialize_ptrue();

    // we should not really care that lr is no longer the callee
    // address. we saved the value the handler needs in r19 so we can
    // just copy it to r3. however, the C2 handler will push its own
    // frame and then calls into the VM and the VM code asserts that
    // the PC for the frame above the handler belongs to a compiled
    // Java method. So, we restore lr here to satisfy that assert.
    __ mov(lr, r19);
    // setup r0 & r3 & clear pending exception
    __ mov(r3, r19);
    __ mov(r19, r0);
    __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
    __ str(zr, Address(rthread, Thread::pending_exception_offset()));

#ifdef ASSERT
    // make sure exception is set
    {
      Label L;
      __ cbnz(r0, L);
      __ stop("StubRoutines::forward exception: no pending exception (2)");
      __ bind(L);
    }
#endif

    // continue at exception handler
    // r0: exception
    // r3: throwing pc
    // r19: exception handler
    __ verify_oop(r0);
    __ br(r19);

    return start;
  }

  // Non-destructive plausibility checks for oops
  //
  // Arguments:
  //    r0: oop to verify
  //    rscratch1: error message
  //
  // Stack after saving c_rarg3:
  //    [tos + 0]: saved c_rarg3
  //    [tos + 1]: saved c_rarg2
  //    [tos + 2]: saved lr
  //    [tos + 3]: saved rscratch2
  //    [tos + 4]: saved r0
  //    [tos + 5]: saved rscratch1
  address generate_verify_oop() {

    StubCodeMark mark(this, "StubRoutines", "verify_oop");
    address start = __ pc();

    Label exit, error;

    // save c_rarg2 and c_rarg3
    __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));

    // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
    __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
    __ ldr(c_rarg3, Address(c_rarg2));
    __ add(c_rarg3, c_rarg3, 1);
    __ str(c_rarg3, Address(c_rarg2));

    // object is in r0
    // make sure object is 'reasonable'
    __ cbz(r0, exit); // if obj is NULL it is OK

#if INCLUDE_ZGC
    if (UseZGC) {
      // Check if mask is good.
      // verifies that ZAddressBadMask & r0 == 0
      __ ldr(c_rarg3, Address(rthread, ZThreadLocalData::address_bad_mask_offset()));
      __ andr(c_rarg2, r0, c_rarg3);
      __ cbnz(c_rarg2, error);
    }
#endif

    // Check if the oop is in the right area of memory
    __ mov(c_rarg3, (intptr_t) Universe::verify_oop_mask());
    __ andr(c_rarg2, r0, c_rarg3);
    __ mov(c_rarg3, (intptr_t) Universe::verify_oop_bits());

    // Compare c_rarg2 and c_rarg3.  We don't use a compare
    // instruction here because the flags register is live.
    __ eor(c_rarg2, c_rarg2, c_rarg3);
    __ cbnz(c_rarg2, error);

    // make sure klass is 'reasonable', which is not zero.
    __ load_klass(r0, r0);  // get klass
    __ cbz(r0, error);      // if klass is NULL it is broken

    // return if everything seems ok
    __ bind(exit);

    __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
    __ ret(lr);

    // handle errors
    __ bind(error);
    __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));

    __ push(RegSet::range(r0, r29), sp);
    // debug(char* msg, int64_t pc, int64_t regs[])
    __ mov(c_rarg0, rscratch1);      // pass address of error message
    __ mov(c_rarg1, lr);             // pass return address
    __ mov(c_rarg2, sp);             // pass address of regs on stack
#ifndef PRODUCT
    assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
#endif
    BLOCK_COMMENT("call MacroAssembler::debug");
    __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
    __ blr(rscratch1);
    __ hlt(0);

    return start;
  }

  // Generate indices for iota vector.
  address generate_iota_indices(const char *stub_name) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", stub_name);
    address start = __ pc();
    // B
    __ emit_data64(0x0706050403020100, relocInfo::none);
    __ emit_data64(0x0F0E0D0C0B0A0908, relocInfo::none);
    // H
    __ emit_data64(0x0003000200010000, relocInfo::none);
    __ emit_data64(0x0007000600050004, relocInfo::none);
    // S
    __ emit_data64(0x0000000100000000, relocInfo::none);
    __ emit_data64(0x0000000300000002, relocInfo::none);
    // D
    __ emit_data64(0x0000000000000000, relocInfo::none);
    __ emit_data64(0x0000000000000001, relocInfo::none);
    // S - FP
    __ emit_data64(0x3F80000000000000, relocInfo::none); // 0.0f, 1.0f
    __ emit_data64(0x4040000040000000, relocInfo::none); // 2.0f, 3.0f
    // D - FP
    __ emit_data64(0x0000000000000000, relocInfo::none); // 0.0d
    __ emit_data64(0x3FF0000000000000, relocInfo::none); // 1.0d
    return start;
  }

  // The inner part of zero_words().  This is the bulk operation,
  // zeroing words in blocks, possibly using DC ZVA to do it.  The
  // caller is responsible for zeroing the last few words.
  //
  // Inputs:
  // r10: the HeapWord-aligned base address of an array to zero.
  // r11: the count in HeapWords, r11 > 0.
  //
  // Returns r10 and r11, adjusted for the caller to clear.
  // r10: the base address of the tail of words left to clear.
  // r11: the number of words in the tail.
  //      r11 < MacroAssembler::zero_words_block_size.

  address generate_zero_blocks() {
    Label done;
    Label base_aligned;

    Register base = r10, cnt = r11;

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "zero_blocks");
    address start = __ pc();

    if (UseBlockZeroing) {
      int zva_length = VM_Version::zva_length();

      // Ensure ZVA length can be divided by 16. This is required by
      // the subsequent operations.
      assert (zva_length % 16 == 0, "Unexpected ZVA Length");

      __ tbz(base, 3, base_aligned);
      __ str(zr, Address(__ post(base, 8)));
      __ sub(cnt, cnt, 1);
      __ bind(base_aligned);

      // Ensure count >= zva_length * 2 so that it still deserves a zva after
      // alignment.
      Label small;
      int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
      __ subs(rscratch1, cnt, low_limit >> 3);
      __ br(Assembler::LT, small);
      __ zero_dcache_blocks(base, cnt);
      __ bind(small);
    }

    {
      // Number of stp instructions we'll unroll
      const int unroll =
        MacroAssembler::zero_words_block_size / 2;
      // Clear the remaining blocks.
      Label loop;
      __ subs(cnt, cnt, unroll * 2);
      __ br(Assembler::LT, done);
      __ bind(loop);
      for (int i = 0; i < unroll; i++)
        __ stp(zr, zr, __ post(base, 16));
      __ subs(cnt, cnt, unroll * 2);
      __ br(Assembler::GE, loop);
      __ bind(done);
      __ add(cnt, cnt, unroll * 2);
    }

    __ ret(lr);

    return start;
  }

  typedef enum {
    copy_forwards = 1,
    copy_backwards = -1
  } copy_direction;

  // Bulk copy of blocks of 8 words.
  //
  // count is a count of words.
  //
  // Precondition: count >= 8
  //
  // Postconditions:
  //
  // The least significant bit of count contains the remaining count
  // of words to copy.  The rest of count is trash.
  //
  // s and d are adjusted to point to the remaining words to copy
  //
  void generate_copy_longs(Label &start, Register s, Register d, Register count,
                           copy_direction direction) {
    int unit = wordSize * direction;
    int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;

    const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
      t4 = r7, t5 = r10, t6 = r11, t7 = r12;
    const Register stride = r13;

    assert_different_registers(rscratch1, t0, t1, t2, t3, t4, t5, t6, t7);
    assert_different_registers(s, d, count, rscratch1);

    Label again, drain;
    const char *stub_name;
    if (direction == copy_forwards)
      stub_name = "forward_copy_longs";
    else
      stub_name = "backward_copy_longs";

    __ align(CodeEntryAlignment);

    StubCodeMark mark(this, "StubRoutines", stub_name);

    __ bind(start);

    Label unaligned_copy_long;
    if (AvoidUnalignedAccesses) {
      __ tbnz(d, 3, unaligned_copy_long);
    }

    if (direction == copy_forwards) {
      __ sub(s, s, bias);
      __ sub(d, d, bias);
    }

#ifdef ASSERT
    // Make sure we are never given < 8 words
    {
      Label L;
      __ cmp(count, (u1)8);
      __ br(Assembler::GE, L);
      __ stop("genrate_copy_longs called with < 8 words");
      __ bind(L);
    }
#endif

    // Fill 8 registers
    if (UseSIMDForMemoryOps) {
      __ ldpq(v0, v1, Address(s, 4 * unit));
      __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
    } else {
      __ ldp(t0, t1, Address(s, 2 * unit));
      __ ldp(t2, t3, Address(s, 4 * unit));
      __ ldp(t4, t5, Address(s, 6 * unit));
      __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
    }

    __ subs(count, count, 16);
    __ br(Assembler::LO, drain);

    int prefetch = PrefetchCopyIntervalInBytes;
    bool use_stride = false;
    if (direction == copy_backwards) {
       use_stride = prefetch > 256;
       prefetch = -prefetch;
       if (use_stride) __ mov(stride, prefetch);
    }

    __ bind(again);

    if (PrefetchCopyIntervalInBytes > 0)
      __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);

    if (UseSIMDForMemoryOps) {
      __ stpq(v0, v1, Address(d, 4 * unit));
      __ ldpq(v0, v1, Address(s, 4 * unit));
      __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
      __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
    } else {
      __ stp(t0, t1, Address(d, 2 * unit));
      __ ldp(t0, t1, Address(s, 2 * unit));
      __ stp(t2, t3, Address(d, 4 * unit));
      __ ldp(t2, t3, Address(s, 4 * unit));
      __ stp(t4, t5, Address(d, 6 * unit));
      __ ldp(t4, t5, Address(s, 6 * unit));
      __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
      __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
    }

    __ subs(count, count, 8);
    __ br(Assembler::HS, again);

    // Drain
    __ bind(drain);
    if (UseSIMDForMemoryOps) {
      __ stpq(v0, v1, Address(d, 4 * unit));
      __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
    } else {
      __ stp(t0, t1, Address(d, 2 * unit));
      __ stp(t2, t3, Address(d, 4 * unit));
      __ stp(t4, t5, Address(d, 6 * unit));
      __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
    }

    {
      Label L1, L2;
      __ tbz(count, exact_log2(4), L1);
      if (UseSIMDForMemoryOps) {
        __ ldpq(v0, v1, Address(__ pre(s, 4 * unit)));
        __ stpq(v0, v1, Address(__ pre(d, 4 * unit)));
      } else {
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
        __ stp(t0, t1, Address(d, 2 * unit));
        __ stp(t2, t3, Address(__ pre(d, 4 * unit)));
      }
      __ bind(L1);

      if (direction == copy_forwards) {
        __ add(s, s, bias);
        __ add(d, d, bias);
      }

      __ tbz(count, 1, L2);
      __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
      __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
      __ bind(L2);
    }

    __ ret(lr);

    if (AvoidUnalignedAccesses) {
      Label drain, again;
      // Register order for storing. Order is different for backward copy.

      __ bind(unaligned_copy_long);

      // source address is even aligned, target odd aligned
      //
      // when forward copying word pairs we read long pairs at offsets
      // {0, 2, 4, 6} (in long words). when backwards copying we read
      // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
      // address by -2 in the forwards case so we can compute the
      // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
      // or -1.
      //
      // when forward copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
      // zero offset We adjust the destination by -1 which means we
      // have to use offsets { 1, 2, 4, 6, 8} * unit for the stores.
      //
      // When backwards copyng we need to store 1 word, 3 pairs and
      // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
      // offsets {1, 3, 5, 7, 8} * unit.

      if (direction == copy_forwards) {
        __ sub(s, s, 16);
        __ sub(d, d, 8);
      }

      // Fill 8 registers
      //
      // for forwards copy s was offset by -16 from the original input
      // value of s so the register contents are at these offsets
      // relative to the 64 bit block addressed by that original input
      // and so on for each successive 64 byte block when s is updated
      //
      // t0 at offset 0,  t1 at offset 8
      // t2 at offset 16, t3 at offset 24
      // t4 at offset 32, t5 at offset 40
      // t6 at offset 48, t7 at offset 56

      // for backwards copy s was not offset so the register contents
      // are at these offsets into the preceding 64 byte block
      // relative to that original input and so on for each successive
      // preceding 64 byte block when s is updated. this explains the
      // slightly counter-intuitive looking pattern of register usage
      // in the stp instructions for backwards copy.
      //
      // t0 at offset -16, t1 at offset -8
      // t2 at offset -32, t3 at offset -24
      // t4 at offset -48, t5 at offset -40
      // t6 at offset -64, t7 at offset -56

      __ ldp(t0, t1, Address(s, 2 * unit));
      __ ldp(t2, t3, Address(s, 4 * unit));
      __ ldp(t4, t5, Address(s, 6 * unit));
      __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));

      __ subs(count, count, 16);
      __ br(Assembler::LO, drain);

      int prefetch = PrefetchCopyIntervalInBytes;
      bool use_stride = false;
      if (direction == copy_backwards) {
         use_stride = prefetch > 256;
         prefetch = -prefetch;
         if (use_stride) __ mov(stride, prefetch);
      }

      __ bind(again);

      if (PrefetchCopyIntervalInBytes > 0)
        __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);

      if (direction == copy_forwards) {
       // allowing for the offset of -8 the store instructions place
       // registers into the target 64 bit block at the following
       // offsets
       //
       // t0 at offset 0
       // t1 at offset 8,  t2 at offset 16
       // t3 at offset 24, t4 at offset 32
       // t5 at offset 40, t6 at offset 48
       // t7 at offset 56

        __ str(t0, Address(d, 1 * unit));
        __ stp(t1, t2, Address(d, 2 * unit));
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ stp(t3, t4, Address(d, 4 * unit));
        __ ldp(t2, t3, Address(s, 4 * unit));
        __ stp(t5, t6, Address(d, 6 * unit));
        __ ldp(t4, t5, Address(s, 6 * unit));
        __ str(t7, Address(__ pre(d, 8 * unit)));
        __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
      } else {
       // d was not offset when we started so the registers are
       // written into the 64 bit block preceding d with the following
       // offsets
       //
       // t1 at offset -8
       // t3 at offset -24, t0 at offset -16
       // t5 at offset -48, t2 at offset -32
       // t7 at offset -56, t4 at offset -48
       //                   t6 at offset -64
       //
       // note that this matches the offsets previously noted for the
       // loads

        __ str(t1, Address(d, 1 * unit));
        __ stp(t3, t0, Address(d, 3 * unit));
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ stp(t5, t2, Address(d, 5 * unit));
        __ ldp(t2, t3, Address(s, 4 * unit));
        __ stp(t7, t4, Address(d, 7 * unit));
        __ ldp(t4, t5, Address(s, 6 * unit));
        __ str(t6, Address(__ pre(d, 8 * unit)));
        __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
      }

      __ subs(count, count, 8);
      __ br(Assembler::HS, again);

      // Drain
      //
      // this uses the same pattern of offsets and register arguments
      // as above
      __ bind(drain);
      if (direction == copy_forwards) {
        __ str(t0, Address(d, 1 * unit));
        __ stp(t1, t2, Address(d, 2 * unit));
        __ stp(t3, t4, Address(d, 4 * unit));
        __ stp(t5, t6, Address(d, 6 * unit));
        __ str(t7, Address(__ pre(d, 8 * unit)));
      } else {
        __ str(t1, Address(d, 1 * unit));
        __ stp(t3, t0, Address(d, 3 * unit));
        __ stp(t5, t2, Address(d, 5 * unit));
        __ stp(t7, t4, Address(d, 7 * unit));
        __ str(t6, Address(__ pre(d, 8 * unit)));
      }
      // now we need to copy any remaining part block which may
      // include a 4 word block subblock and/or a 2 word subblock.
      // bits 2 and 1 in the count are the tell-tale for whether we
      // have each such subblock
      {
        Label L1, L2;
        __ tbz(count, exact_log2(4), L1);
       // this is the same as above but copying only 4 longs hence
       // with only one intervening stp between the str instructions
       // but note that the offsets and registers still follow the
       // same pattern
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
        if (direction == copy_forwards) {
          __ str(t0, Address(d, 1 * unit));
          __ stp(t1, t2, Address(d, 2 * unit));
          __ str(t3, Address(__ pre(d, 4 * unit)));
        } else {
          __ str(t1, Address(d, 1 * unit));
          __ stp(t3, t0, Address(d, 3 * unit));
          __ str(t2, Address(__ pre(d, 4 * unit)));
        }
        __ bind(L1);

        __ tbz(count, 1, L2);
       // this is the same as above but copying only 2 longs hence
       // there is no intervening stp between the str instructions
       // but note that the offset and register patterns are still
       // the same
        __ ldp(t0, t1, Address(__ pre(s, 2 * unit)));
        if (direction == copy_forwards) {
          __ str(t0, Address(d, 1 * unit));
          __ str(t1, Address(__ pre(d, 2 * unit)));
        } else {
          __ str(t1, Address(d, 1 * unit));
          __ str(t0, Address(__ pre(d, 2 * unit)));
        }
        __ bind(L2);

       // for forwards copy we need to re-adjust the offsets we
       // applied so that s and d are follow the last words written

       if (direction == copy_forwards) {
         __ add(s, s, 16);
         __ add(d, d, 8);
       }

      }

      __ ret(lr);
      }
  }

  // Small copy: less than 16 bytes.
  //
  // NB: Ignores all of the bits of count which represent more than 15
  // bytes, so a caller doesn't have to mask them.

  void copy_memory_small(Register s, Register d, Register count, Register tmp, int step) {
    bool is_backwards = step < 0;
    size_t granularity = uabs(step);
    int direction = is_backwards ? -1 : 1;
    int unit = wordSize * direction;

    Label Lword, Lint, Lshort, Lbyte;

    assert(granularity
           && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");

    const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6;

    // ??? I don't know if this bit-test-and-branch is the right thing
    // to do.  It does a lot of jumping, resulting in several
    // mispredicted branches.  It might make more sense to do this
    // with something like Duff's device with a single computed branch.

    __ tbz(count, 3 - exact_log2(granularity), Lword);
    __ ldr(tmp, Address(__ adjust(s, unit, is_backwards)));
    __ str(tmp, Address(__ adjust(d, unit, is_backwards)));
    __ bind(Lword);

    if (granularity <= sizeof (jint)) {
      __ tbz(count, 2 - exact_log2(granularity), Lint);
      __ ldrw(tmp, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
      __ strw(tmp, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
      __ bind(Lint);
    }

    if (granularity <= sizeof (jshort)) {
      __ tbz(count, 1 - exact_log2(granularity), Lshort);
      __ ldrh(tmp, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
      __ strh(tmp, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
      __ bind(Lshort);
    }

    if (granularity <= sizeof (jbyte)) {
      __ tbz(count, 0, Lbyte);
      __ ldrb(tmp, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
      __ strb(tmp, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
      __ bind(Lbyte);
    }
  }

  Label copy_f, copy_b;

  // All-singing all-dancing memory copy.
  //
  // Copy count units of memory from s to d.  The size of a unit is
  // step, which can be positive or negative depending on the direction
  // of copy.  If is_aligned is false, we align the source address.
  //

  void copy_memory(bool is_aligned, Register s, Register d,
                   Register count, Register tmp, int step) {
    copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
    bool is_backwards = step < 0;
    unsigned int granularity = uabs(step);
    const Register t0 = r3, t1 = r4;

    // <= 80 (or 96 for SIMD) bytes do inline. Direction doesn't matter because we always
    // load all the data before writing anything
    Label copy4, copy8, copy16, copy32, copy80, copy_big, finish;
    const Register t2 = r5, t3 = r6, t4 = r7, t5 = r8;
    const Register t6 = r9, t7 = r10, t8 = r11, t9 = r12;
    const Register send = r17, dend = r16;

    if (PrefetchCopyIntervalInBytes > 0)
      __ prfm(Address(s, 0), PLDL1KEEP);
    __ cmp(count, u1((UseSIMDForMemoryOps ? 96:80)/granularity));
    __ br(Assembler::HI, copy_big);

    __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
    __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));

    __ cmp(count, u1(16/granularity));
    __ br(Assembler::LS, copy16);

    __ cmp(count, u1(64/granularity));
    __ br(Assembler::HI, copy80);

    __ cmp(count, u1(32/granularity));
    __ br(Assembler::LS, copy32);

    // 33..64 bytes
    if (UseSIMDForMemoryOps) {
      __ ldpq(v0, v1, Address(s, 0));
      __ ldpq(v2, v3, Address(send, -32));
      __ stpq(v0, v1, Address(d, 0));
      __ stpq(v2, v3, Address(dend, -32));
    } else {
      __ ldp(t0, t1, Address(s, 0));
      __ ldp(t2, t3, Address(s, 16));
      __ ldp(t4, t5, Address(send, -32));
      __ ldp(t6, t7, Address(send, -16));

      __ stp(t0, t1, Address(d, 0));
      __ stp(t2, t3, Address(d, 16));
      __ stp(t4, t5, Address(dend, -32));
      __ stp(t6, t7, Address(dend, -16));
    }
    __ b(finish);

    // 17..32 bytes
    __ bind(copy32);
    __ ldp(t0, t1, Address(s, 0));
    __ ldp(t2, t3, Address(send, -16));
    __ stp(t0, t1, Address(d, 0));
    __ stp(t2, t3, Address(dend, -16));
    __ b(finish);

    // 65..80/96 bytes
    // (96 bytes if SIMD because we do 32 byes per instruction)
    __ bind(copy80);
    if (UseSIMDForMemoryOps) {
      __ ldpq(v0, v1, Address(s, 0));
      __ ldpq(v2, v3, Address(s, 32));
      // Unaligned pointers can be an issue for copying.
      // The issue has more chances to happen when granularity of data is
      // less than 4(sizeof(jint)). Pointers for arrays of jint are at least
      // 4 byte aligned. Pointers for arrays of jlong are 8 byte aligned.
      // The most performance drop has been seen for the range 65-80 bytes.
      // For such cases using the pair of ldp/stp instead of the third pair of
      // ldpq/stpq fixes the performance issue.
      if (granularity < sizeof (jint)) {
        Label copy96;
        __ cmp(count, u1(80/granularity));
        __ br(Assembler::HI, copy96);
        __ ldp(t0, t1, Address(send, -16));

        __ stpq(v0, v1, Address(d, 0));
        __ stpq(v2, v3, Address(d, 32));
        __ stp(t0, t1, Address(dend, -16));
        __ b(finish);

        __ bind(copy96);
      }
      __ ldpq(v4, v5, Address(send, -32));

      __ stpq(v0, v1, Address(d, 0));
      __ stpq(v2, v3, Address(d, 32));
      __ stpq(v4, v5, Address(dend, -32));
    } else {
      __ ldp(t0, t1, Address(s, 0));
      __ ldp(t2, t3, Address(s, 16));
      __ ldp(t4, t5, Address(s, 32));
      __ ldp(t6, t7, Address(s, 48));
      __ ldp(t8, t9, Address(send, -16));

      __ stp(t0, t1, Address(d, 0));
      __ stp(t2, t3, Address(d, 16));
      __ stp(t4, t5, Address(d, 32));
      __ stp(t6, t7, Address(d, 48));
      __ stp(t8, t9, Address(dend, -16));
    }
    __ b(finish);

    // 0..16 bytes
    __ bind(copy16);
    __ cmp(count, u1(8/granularity));
    __ br(Assembler::LO, copy8);

    // 8..16 bytes
    __ ldr(t0, Address(s, 0));
    __ ldr(t1, Address(send, -8));
    __ str(t0, Address(d, 0));
    __ str(t1, Address(dend, -8));
    __ b(finish);

    if (granularity < 8) {
      // 4..7 bytes
      __ bind(copy8);
      __ tbz(count, 2 - exact_log2(granularity), copy4);
      __ ldrw(t0, Address(s, 0));
      __ ldrw(t1, Address(send, -4));
      __ strw(t0, Address(d, 0));
      __ strw(t1, Address(dend, -4));
      __ b(finish);
      if (granularity < 4) {
        // 0..3 bytes
        __ bind(copy4);
        __ cbz(count, finish); // get rid of 0 case
        if (granularity == 2) {
          __ ldrh(t0, Address(s, 0));
          __ strh(t0, Address(d, 0));
        } else { // granularity == 1
          // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
          // the first and last byte.
          // Handle the 3 byte case by loading and storing base + count/2
          // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
          // This does means in the 1 byte case we load/store the same
          // byte 3 times.
          __ lsr(count, count, 1);
          __ ldrb(t0, Address(s, 0));
          __ ldrb(t1, Address(send, -1));
          __ ldrb(t2, Address(s, count));
          __ strb(t0, Address(d, 0));
          __ strb(t1, Address(dend, -1));
          __ strb(t2, Address(d, count));
        }
        __ b(finish);
      }
    }

    __ bind(copy_big);
    if (is_backwards) {
      __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
      __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
    }

    // Now we've got the small case out of the way we can align the
    // source address on a 2-word boundary.

    Label aligned;

    if (is_aligned) {
      // We may have to adjust by 1 word to get s 2-word-aligned.
      __ tbz(s, exact_log2(wordSize), aligned);
      __ ldr(tmp, Address(__ adjust(s, direction * wordSize, is_backwards)));
      __ str(tmp, Address(__ adjust(d, direction * wordSize, is_backwards)));
      __ sub(count, count, wordSize/granularity);
    } else {
      if (is_backwards) {
        __ andr(rscratch2, s, 2 * wordSize - 1);
      } else {
        __ neg(rscratch2, s);
        __ andr(rscratch2, rscratch2, 2 * wordSize - 1);
      }
      // rscratch2 is the byte adjustment needed to align s.
      __ cbz(rscratch2, aligned);
      int shift = exact_log2(granularity);
      if (shift)  __ lsr(rscratch2, rscratch2, shift);
      __ sub(count, count, rscratch2);

#if 0
      // ?? This code is only correct for a disjoint copy.  It may or
      // may not make sense to use it in that case.

      // Copy the first pair; s and d may not be aligned.
      __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
      __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));

      // Align s and d, adjust count
      if (is_backwards) {
        __ sub(s, s, rscratch2);
        __ sub(d, d, rscratch2);
      } else {
        __ add(s, s, rscratch2);
        __ add(d, d, rscratch2);
      }
#else
      copy_memory_small(s, d, rscratch2, rscratch1, step);
#endif
    }

    __ bind(aligned);

    // s is now 2-word-aligned.

    // We have a count of units and some trailing bytes.  Adjust the
    // count and do a bulk copy of words.
    __ lsr(rscratch2, count, exact_log2(wordSize/granularity));
    if (direction == copy_forwards)
      __ bl(copy_f);
    else
      __ bl(copy_b);

    // And the tail.
    copy_memory_small(s, d, count, tmp, step);

    if (granularity >= 8) __ bind(copy8);
    if (granularity >= 4) __ bind(copy4);
    __ bind(finish);
  }

  void clobber_registers() {
#ifdef ASSERT
    RegSet clobbered
      = MacroAssembler::call_clobbered_gp_registers() - rscratch1;
    __ mov(rscratch1, (uint64_t)0xdeadbeef);
    __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
    for (RegSetIterator<Register> it = clobbered.begin(); *it != noreg; ++it) {
      __ mov(*it, rscratch1);
    }
#endif

  }

  // Scan over array at a for count oops, verifying each one.
  // Preserves a and count, clobbers rscratch1 and rscratch2.
  void verify_oop_array (int size, Register a, Register count, Register temp) {
    Label loop, end;
    __ mov(rscratch1, a);
    __ mov(rscratch2, zr);
    __ bind(loop);
    __ cmp(rscratch2, count);
    __ br(Assembler::HS, end);
    if (size == wordSize) {
      __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
      __ verify_oop(temp);
    } else {
      __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
      __ decode_heap_oop(temp); // calls verify_oop
    }
    __ add(rscratch2, rscratch2, 1);
    __ b(loop);
    __ bind(end);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   is_oop  - true => oop array, so generate store check code
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  // Side Effects:
  //   disjoint_int_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_int_oop_copy().
  //
  address generate_disjoint_copy(int size, bool aligned, bool is_oop, address *entry,
                                  const char *name, bool dest_uninitialized = false) {
    Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
    RegSet saved_reg = RegSet::of(s, d, count);
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();
    __ enter();

    if (entry != NULL) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
    if (dest_uninitialized) {
      decorators |= IS_DEST_UNINITIALIZED;
    }
    if (aligned) {
      decorators |= ARRAYCOPY_ALIGNED;
    }

    BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
    bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg);

    if (is_oop) {
      // save regs before copy_memory
      __ push(RegSet::of(d, count), sp);
    }
    {
      // UnsafeCopyMemory page error: continue after ucm
      bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
      UnsafeCopyMemoryMark ucmm(this, add_entry, true);
      copy_memory(aligned, s, d, count, rscratch1, size);
    }

    if (is_oop) {
      __ pop(RegSet::of(d, count), sp);
      if (VerifyOops)
        verify_oop_array(size, d, count, r16);
    }

    bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());

    __ leave();
    __ mov(r0, zr); // return 0
    __ ret(lr);
    return start;
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   is_oop  - true => oop array, so generate store check code
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  address generate_conjoint_copy(int size, bool aligned, bool is_oop, address nooverlap_target,
                                 address *entry, const char *name,
                                 bool dest_uninitialized = false) {
    Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
    RegSet saved_regs = RegSet::of(s, d, count);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();
    __ enter();

    if (entry != NULL) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    // use fwd copy when (d-s) above_equal (count*size)
    __ sub(rscratch1, d, s);
    __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
    __ br(Assembler::HS, nooverlap_target);

    DecoratorSet decorators = IN_HEAP | IS_ARRAY;
    if (dest_uninitialized) {
      decorators |= IS_DEST_UNINITIALIZED;
    }
    if (aligned) {
      decorators |= ARRAYCOPY_ALIGNED;
    }

    BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
    bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs);

    if (is_oop) {
      // save regs before copy_memory
      __ push(RegSet::of(d, count), sp);
    }
    {
      // UnsafeCopyMemory page error: continue after ucm
      bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
      UnsafeCopyMemoryMark ucmm(this, add_entry, true);
      copy_memory(aligned, s, d, count, rscratch1, -size);
    }
    if (is_oop) {
      __ pop(RegSet::of(d, count), sp);
      if (VerifyOops)
        verify_oop_array(size, d, count, r16);
    }
    bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
    __ leave();
    __ mov(r0, zr); // return 0
    __ ret(lr);
    return start;
}

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
  // we let the hardware handle it.  The one to eight bytes within words,
  // dwords or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  // Side Effects:
  //   disjoint_byte_copy_entry is set to the no-overlap entry point  //
  // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
  // we let the hardware handle it.  The one to eight bytes within words,
  // dwords or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  // Side Effects:
  //   disjoint_byte_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_byte_copy().
  //
  address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
  // we let the hardware handle it.  The one to eight bytes within words,
  // dwords or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
                                      address* entry, const char *name) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
  // let the hardware handle it.  The two or four words within dwords
  // or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  // Side Effects:
  //   disjoint_short_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_short_copy().
  //
  address generate_disjoint_short_copy(bool aligned,
                                       address* entry, const char *name) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
  // let the hardware handle it.  The two or four words within dwords
  // or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
                                       address *entry, const char *name) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name);

  }
  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  // Side Effects:
  //   disjoint_int_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_int_oop_copy().
  //
  address generate_disjoint_int_copy(bool aligned, address *entry,
                                         const char *name, bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
                                     address *entry, const char *name,
                                     bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as size_t, can be zero
  //
  // Side Effects:
  //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
  //   no-overlap entry point used by generate_conjoint_long_oop_copy().
  //
  address generate_disjoint_long_copy(bool aligned, address *entry,
                                          const char *name, bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as size_t, can be zero
  //
  address generate_conjoint_long_copy(bool aligned,
                                      address nooverlap_target, address *entry,
                                      const char *name, bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as size_t, can be zero
  //
  // Side Effects:
  //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
  //   no-overlap entry point used by generate_conjoint_long_oop_copy().
  //
  address generate_disjoint_oop_copy(bool aligned, address *entry,
                                     const char *name, bool dest_uninitialized) {
    const bool is_oop = true;
    const int size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
    return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as size_t, can be zero
  //
  address generate_conjoint_oop_copy(bool aligned,
                                     address nooverlap_target, address *entry,
                                     const char *name, bool dest_uninitialized) {
    const bool is_oop = true;
    const int size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
    return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry,
                                  name, dest_uninitialized);
  }

  // Helper for generating a dynamic type check.
  // Smashes rscratch1, rscratch2.
  void generate_type_check(Register sub_klass,
                           Register super_check_offset,
                           Register super_klass,
                           Label& L_success) {
    assert_different_registers(sub_klass, super_check_offset, super_klass);

    BLOCK_COMMENT("type_check:");

    Label L_miss;

    __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, NULL,
                                     super_check_offset);
    __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL);

    // Fall through on failure!
    __ BIND(L_miss);
  }

  //
  //  Generate checkcasting array copy stub
  //
  //  Input:
  //    c_rarg0   - source array address
  //    c_rarg1   - destination array address
  //    c_rarg2   - element count, treated as ssize_t, can be zero
  //    c_rarg3   - size_t ckoff (super_check_offset)
  //    c_rarg4   - oop ckval (super_klass)
  //
  //  Output:
  //    r0 ==  0  -  success
  //    r0 == -1^K - failure, where K is partial transfer count
  //
  address generate_checkcast_copy(const char *name, address *entry,
                                  bool dest_uninitialized = false) {

    Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;

    // Input registers (after setup_arg_regs)
    const Register from        = c_rarg0;   // source array address
    const Register to          = c_rarg1;   // destination array address
    const Register count       = c_rarg2;   // elementscount
    const Register ckoff       = c_rarg3;   // super_check_offset
    const Register ckval       = c_rarg4;   // super_klass

    RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4);
    RegSet wb_post_saved_regs = RegSet::of(count);

    // Registers used as temps (r19, r20, r21, r22 are save-on-entry)
    const Register copied_oop  = r22;       // actual oop copied
    const Register count_save  = r21;       // orig elementscount
    const Register start_to    = r20;       // destination array start address
    const Register r19_klass   = r19;       // oop._klass

    //---------------------------------------------------------------
    // Assembler stub will be used for this call to arraycopy
    // if the two arrays are subtypes of Object[] but the
    // destination array type is not equal to or a supertype
    // of the source type.  Each element must be separately
    // checked.

    assert_different_registers(from, to, count, ckoff, ckval, start_to,
                               copied_oop, r19_klass, count_save);

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    __ enter(); // required for proper stackwalking of RuntimeStub frame

#ifdef ASSERT
    // caller guarantees that the arrays really are different
    // otherwise, we would have to make conjoint checks
    { Label L;
      __ b(L);                  // conjoint check not yet implemented
      __ stop("checkcast_copy within a single array");
      __ bind(L);
    }
#endif //ASSERT

    // Caller of this entry point must set up the argument registers.
    if (entry != NULL) {
      *entry = __ pc();
      BLOCK_COMMENT("Entry:");
    }

     // Empty array:  Nothing to do.
    __ cbz(count, L_done);
    __ push(RegSet::of(r19, r20, r21, r22), sp);

#ifdef ASSERT
    BLOCK_COMMENT("assert consistent ckoff/ckval");
    // The ckoff and ckval must be mutually consistent,
    // even though caller generates both.
    { Label L;
      int sco_offset = in_bytes(Klass::super_check_offset_offset());
      __ ldrw(start_to, Address(ckval, sco_offset));
      __ cmpw(ckoff, start_to);
      __ br(Assembler::EQ, L);
      __ stop("super_check_offset inconsistent");
      __ bind(L);
    }
#endif //ASSERT

    DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
    bool is_oop = true;
    if (dest_uninitialized) {
      decorators |= IS_DEST_UNINITIALIZED;
    }

    BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
    bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs);

    // save the original count
    __ mov(count_save, count);

    // Copy from low to high addresses
    __ mov(start_to, to);              // Save destination array start address
    __ b(L_load_element);

    // ======== begin loop ========
    // (Loop is rotated; its entry is L_load_element.)
    // Loop control:
    //   for (; count != 0; count--) {
    //     copied_oop = load_heap_oop(from++);
    //     ... generate_type_check ...;
    //     store_heap_oop(to++, copied_oop);
    //   }
    __ align(OptoLoopAlignment);

    __ BIND(L_store_element);
    __ store_heap_oop(__ post(to, UseCompressedOops ? 4 : 8), copied_oop, noreg, noreg, noreg, AS_RAW);  // store the oop
    __ sub(count, count, 1);
    __ cbz(count, L_do_card_marks);

    // ======== loop entry is here ========
    __ BIND(L_load_element);
    __ load_heap_oop(copied_oop, __ post(from, UseCompressedOops ? 4 : 8), noreg, noreg, AS_RAW); // load the oop
    __ cbz(copied_oop, L_store_element);

    __ load_klass(r19_klass, copied_oop);// query the object klass
    generate_type_check(r19_klass, ckoff, ckval, L_store_element);
    // ======== end loop ========

    // It was a real error; we must depend on the caller to finish the job.
    // Register count = remaining oops, count_orig = total oops.
--> --------------------

--> maximum size reached

--> --------------------

quality96%

¤ Dauer der Verarbeitung: 0.21 Sekunden ¤

Wurzel

Suchen

Beweissystem der NASA

Beweissystem Isabelle

NIST Cobol Testsuite

Cephes Mathematical Library

Wiener Entwicklungsmethode

Haftungshinweis

Die Informationen auf dieser Webseite wurden nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit, noch Qualität der bereit gestellten Informationen zugesichert.

Bemerkung:

Die farbliche Syntaxdarstellung ist noch experimentell.