/* * Copyright (c) 1997, 2022, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2022 SAP SE. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License version 2 only, as * published by the Free Software Foundation. * * This code is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License * version 2 for more details (a copy is included in the LICENSE file that * accompanied this code). * * You should have received a copy of the GNU General Public License version * 2 along with this work; if not, write to the Free Software Foundation, * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. * * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA * or visit www.oracle.com if you need additional information or have any * questions. *
*/
// Declaration and definition of StubGenerator (no .hpp file). // For a more detailed description of the stub routine structure // see the comment in stubRoutines.hpp.
// Calculate top_of_arguments_addr which will be R17_tos (not prepushed) later. // FIXME: why not simply use SP+frame::top_ijava_frame_size?
__ addi(r_top_of_arguments_addr,
R1_SP, frame::top_ijava_frame_abi_size);
__ add(r_top_of_arguments_addr,
r_top_of_arguments_addr, r_frame_alignment_in_bytes);
// any arguments to copy?
__ cmpdi(CCR0, r_arg_argument_count, 0);
__ beq(CCR0, arguments_copied);
// prepare loop and copy arguments in reverse order
{ // init CTR with arg_argument_count
__ mtctr(r_arg_argument_count);
// let r_argumentcopy_addr point to last outgoing Java arguments P
__ mr(r_argumentcopy_addr, r_top_of_arguments_addr);
// let r_argument_addr point to last incoming java argument
__ add(r_argument_addr,
r_arg_argument_addr, r_argument_size_in_bytes);
__ addi(r_argument_addr, r_argument_addr, -BytesPerWord);
// now loop while CTR > 0 and copy arguments
{
Label next_argument;
__ bind(next_argument);
// initialize call_stub locals (step 2) // now save tos as arguments_tos_address
__ std(tos, _entry_frame_locals_neg(arguments_tos_address), r_entryframe_fp);
// Set R15_prev_state to 0 for simplifying checks in callee.
__ load_const_optimized(R25_templateTableBase, (address)Interpreter::dispatch_table((TosState)0), R11_scratch1); // Stack on entry to frame manager / native entry: // // F0 [TOP_IJAVA_FRAME_ABI] // alignment (optional) // [outgoing Java arguments] // [ENTRY_FRAME_LOCALS] // F1 [C_FRAME] // ... //
// global toc register
__ load_const_optimized(R29_TOC, MacroAssembler::global_toc(), R11_scratch1); // Remember the senderSP so we interpreter can pop c2i arguments off of the stack // when called via a c2i.
// Pass initial_caller_sp to framemanager.
__ mr(R21_sender_SP, R1_SP);
// Do a light-weight C-call here, r_new_arg_entry holds the address // of the interpreter entry point (frame manager or native entry) // and save runtime-value of LR in return_address.
assert(r_new_arg_entry != tos && r_new_arg_entry != R19_method && r_new_arg_entry != R16_thread, "trashed r_new_arg_entry");
return_address = __ call_stub(r_new_arg_entry);
}
{
BLOCK_COMMENT("Returned from frame manager or native entry."); // Returned from frame manager or native entry. // Now pop frame, process result, and return to caller.
// Stack on exit from frame manager / native entry: // // F0 [ABI] // ... // [ENTRY_FRAME_LOCALS] // F1 [C_FRAME] // ... // // Just pop the topmost frame ... //
// Reload some volatile registers which we've spilled before the call // to frame manager / native entry. // Access all locals via frame pointer, because we know nothing about // the topmost frame's size.
__ ld(r_entryframe_fp, _abi0(callers_sp), R1_SP);
assert_different_registers(r_entryframe_fp, R3_RET, r_arg_result_addr, r_arg_result_type, r_cr, r_lr);
__ ld(r_arg_result_addr,
_entry_frame_locals_neg(result_address), r_entryframe_fp);
__ ld(r_arg_result_type,
_entry_frame_locals_neg(result_type), r_entryframe_fp);
__ ld(r_cr, _abi0(cr), r_entryframe_fp);
__ ld(r_lr, _abi0(lr), r_entryframe_fp);
// pop frame and restore non-volatiles, LR and CR
__ mr(R1_SP, r_entryframe_fp);
__ pop_cont_fastpath();
__ mtcr(r_cr);
__ mtlr(r_lr);
// Store result depending on type. Everything that is not // T_OBJECT, T_LONG, T_FLOAT, or T_DOUBLE is treated as T_INT.
__ cmpwi(CCR0, r_arg_result_type, T_OBJECT);
__ cmpwi(CCR1, r_arg_result_type, T_LONG);
__ cmpwi(CCR5, r_arg_result_type, T_FLOAT);
__ cmpwi(CCR6, r_arg_result_type, T_DOUBLE);
// case T_OBJECT:
__ bind(ret_is_object);
__ std(R3_RET, 0, r_arg_result_addr);
__ blr(); // return to caller
// case T_LONG:
__ bind(ret_is_long);
__ std(R3_RET, 0, r_arg_result_addr);
__ blr(); // return to caller
// case T_FLOAT:
__ bind(ret_is_float);
__ stfs(F1_RET, 0, r_arg_result_addr);
__ blr(); // return to caller
// case T_DOUBLE:
__ bind(ret_is_double);
__ stfd(F1_RET, 0, r_arg_result_addr);
__ blr(); // return to caller
}
return start;
}
// Return point for a Java call if there's an exception thrown in // Java code. The exception is caught and transformed into a // pending exception stored in JavaThread that can be tested from // within the VM. //
address generate_catch_exception() {
// Emits the "catch_exception" stub: it records the exception passed in
// R3_ARG1 as the thread's pending exception, stores a file/line pair next
// to it, and then returns to the address held in R4_ARG2.
//
// NOTE(review): `start`, `exception_file` and `exception_line` are used
// below but are not declared anywhere in the visible block. Presumably
// their definitions (and the code that loads the two registers) precede
// these stores -- confirm against the complete file.
StubCodeMark mark(this, "StubRoutines", "catch_exception");
// Publish the exception in JavaThread::pending_exception.
__ std(R3_ARG1, in_bytes(JavaThread::pending_exception_offset()), R16_thread);
// store into `char *' (64-bit store of the file value)
__ std(exception_file, in_bytes(JavaThread::exception_file_offset()), R16_thread);
// store into `int' (32-bit store of the line value)
__ stw(exception_line, in_bytes(JavaThread::exception_line_offset()), R16_thread);
// complete return to VM
// Generation-time check only: the call stub's return address must already exist.
assert(StubRoutines::_call_stub_return_address != NULL, "must have been generated before");
// R4_ARG2 holds the address to resume at; move it to LR and return through it.
__ mtlr(R4_ARG2); // continue in call stub
__ blr();
return start;
}
// Continuation point for runtime calls returning with a pending
// exception. The pending exception check happened in the runtime
// or native call stub. The pending exception in Thread is
// converted into a Java-level exception.
//
// Read:
//
//   LR:     The pc the runtime library callee wants to return to.
//           Since the exception occurred in the callee, the return pc
//           from the point of view of Java is the exception pc.
//   thread: Needed for method handles.
//
// Invalidate:
//
//   volatile registers (except below).
//
// Update:
//
//   R4_ARG2: exception pc
//
// (LR is unchanged and is live out).
//
address generate_forward_exception() {
  // Emits the "forward_exception" stub and returns its entry address.
  //
  // The generated code looks up the exception handler for the return
  // address found in LR, clears the thread's pending exception and jumps
  // to the handler with
  //   R3_ARG1 = exception oop
  //   R4_ARG2 = exception pc (the caller's return address).
  StubCodeMark mark(this, "StubRoutines", "forward_exception");
  address start = __ pc();

  if (VerifyOops) {
    // Get pending exception oop.
    __ ld(R3_ARG1,
          in_bytes(Thread::pending_exception_offset()),
          R16_thread);
    // Make sure that this code is only executed if there is a pending exception.
    {
      Label L;
      __ cmpdi(CCR0, R3_ARG1, 0);
      __ bne(CCR0, L);
      __ stop("StubRoutines::forward exception: no pending exception (1)");
      __ bind(L);
    }
    __ verify_oop(R3_ARG1, "StubRoutines::forward exception: not an oop");
  }

  // Save LR/CR and copy exception pc (LR) into R4_ARG2.
  __ save_LR_CR(R4_ARG2);
  __ push_frame_reg_args(0, R0);
  // Find exception handler.
  __ call_VM_leaf(CAST_FROM_FN_PTR(address,
                                   SharedRuntime::exception_handler_for_return_address),
                  R16_thread,
                  R4_ARG2);
  // Copy handler's address; it is the target of the final bctr.
  __ mtctr(R3_RET);
  __ pop_frame();
  __ restore_LR_CR(R0);

  // Set up the arguments for the exception handler:
  //  - R3_ARG1: exception oop
  //  - R4_ARG2: exception pc.

  // Load pending exception oop.
  // This must happen unconditionally and after the runtime call: the load
  // at the top is only emitted under VerifyOops, and R3 has since been
  // reused for the call's result (R3_RET, the handler address).
  __ ld(R3_ARG1,
        in_bytes(Thread::pending_exception_offset()),
        R16_thread);

  // The exception pc is the return address in the caller.
  // Must load it into R4_ARG2.
  __ mflr(R4_ARG2);

#ifdef ASSERT
  // Make sure exception is set.
  {
    Label L;
    __ cmpdi(CCR0, R3_ARG1, 0);
    __ bne(CCR0, L);
    __ stop("StubRoutines::forward exception: no pending exception (2)");
    __ bind(L);
  }
#endif

  // Clear the pending exception.
  __ li(R0, 0);
  __ std(R0,
         in_bytes(Thread::pending_exception_offset()),
         R16_thread);
  // Jump to exception handler.
  __ bctr();

  return start;
}
#undef __ #define __ masm-> // Continuation point for throwing of implicit exceptions that are // not handled in the current activation. Fabricates an exception // oop and initiates normal exception dispatching in this // frame. Only callee-saved registers are preserved (through the // normal register window / RegisterMap handling). If the compiler // needs all registers to be preserved between the fault point and // the exception handler then it must assume responsibility for that // in AbstractCompiler::continuation_for_implicit_null_exception or // continuation_for_implicit_division_by_zero_exception. All other // implicit exceptions (e.g., NullPointerException or // AbstractMethodError on entry) are either at call sites or // otherwise assume that stack unwinding will be initiated, so // caller saved registers were assumed volatile in the compiler. // // Note that we generate only this stub into a RuntimeStub, because // it needs to be properly traversed and ignored during GC, so we // change the meaning of the "__" macro within this method. // // Note: the routine set_pc_not_at_call_for_caller in // SharedRuntime.cpp requires that this code be generated into a // RuntimeStub.
address generate_throw_exception(constchar* name, address runtime_entry, bool restore_saved_exception_pc, Register arg1 = noreg, Register arg2 = noreg) {
CodeBuffer code(name, 1024 DEBUG_ONLY(+ 512), 0);
MacroAssembler* masm = new MacroAssembler(&code);
OopMapSet* oop_maps = new OopMapSet(); int frame_size_in_bytes = frame::abi_reg_args_size;
OopMap* map = new OopMap(frame_size_in_bytes / sizeof(jint), 0);
address start = __ pc();
__ save_LR_CR(R11_scratch1);
// Push a frame.
__ push_frame_reg_args(0, R11_scratch1);
address frame_complete_pc = __ pc();
if (restore_saved_exception_pc) {
__ unimplemented("StubGenerator::throw_exception with restore_saved_exception_pc");
}
// Note that we always have a runtime stub frame on the top of // stack by this point. Remember the offset of the instruction // whose address will be moved to R11_scratch1.
address gc_map_pc = __ get_PC_trash_LR(R11_scratch1);
// Set an oopmap for the call site.
oop_maps->add_gc_map((int)(gc_map_pc - start), map);
__ reset_last_Java_frame();
#ifdef ASSERT // Make sure that this code is only executed if there is a pending // exception.
{
Label L;
__ ld(R0,
in_bytes(Thread::pending_exception_offset()),
R16_thread);
__ cmpdi(CCR0, R0, 0);
__ bne(CCR0, L);
__ stop("StubRoutines::throw_exception: no pending exception");
__ bind(L);
} #endif
// Procedure for large arrays (uses data cache block zero instruction).
Label dwloop, fast, fastloop, restloop, lastdword, done; int cl_size = VM_Version::L1_data_cache_line_size(); int cl_dwords = cl_size >> 3; int cl_dwordaddr_bits = exact_log2(cl_dwords); int min_dcbz = 2; // Needs to be positive, apply dcbz only to at least min_dcbz cache lines.
// Clear up to 128byte boundary if long enough, dword_cnt=(16-(base>>3))%16.
__ dcbtst(base_ptr_reg); // Indicate write access to first cache line ...
__ andi(tmp2_reg, cnt_dwords_reg, 1); // to check if number of dwords is even.
__ srdi_(tmp1_reg, cnt_dwords_reg, 1); // number of double dwords
__ load_const_optimized(zero_reg, 0L); // Use as zero register.
__ cmpdi(CCR1, tmp2_reg, 0); // cnt_dwords even?
__ beq(CCR0, lastdword); // size <= 1
__ mtctr(tmp1_reg); // Speculatively preload counter for rest loop (>0).
__ cmpdi(CCR0, cnt_dwords_reg, (min_dcbz+1)*cl_dwords-1); // Big enough to ensure >=min_dcbz cache lines are included?
__ neg(tmp1_reg, base_ptr_reg); // bit 0..58: bogus, bit 57..60: (16-(base>>3))%16, bit 61..63: 000
__ blt(CCR0, restloop); // Too small. (<31=(2*cl_dwords)-1 is sufficient, but bigger performs better.)
__ rldicl_(tmp1_reg, tmp1_reg, 64-3, 64-cl_dwordaddr_bits); // Extract number of dwords to 128byte boundary=(16-(base>>3))%16.
__ beq(CCR0, fast); // already 128byte aligned
__ mtctr(tmp1_reg); // Set ctr to hit 128byte boundary (0<ctr<cnt).
__ subf(cnt_dwords_reg, tmp1_reg, cnt_dwords_reg); // rest (>0 since size>=256-8)
// Clear in first cache line dword-by-dword if not already 128byte aligned.
__ bind(dwloop);
__ std(zero_reg, 0, base_ptr_reg); // Clear 8byte aligned block.
__ addi(base_ptr_reg, base_ptr_reg, 8);
__ bdnz(dwloop);
// clear 128byte blocks
__ bind(fast);
__ srdi(tmp1_reg, cnt_dwords_reg, cl_dwordaddr_bits); // loop count for 128byte loop (>0 since size>=256-8)
__ andi(tmp2_reg, cnt_dwords_reg, 1); // to check if rest even
#if !defined(PRODUCT) // Wrapper which calls oopDesc::is_oop_or_null() // Only called by MacroAssembler::verify_oop staticvoid verify_oop_helper(constchar* message, oopDesc* o) { if (!oopDesc::is_oop_or_null(o)) {
fatal("%s. oop: " PTR_FORMAT, message, p2i(o));
}
++ StubRoutines::_verify_oop_count;
} #endif
// Return address of code to be called from code generated by // MacroAssembler::verify_oop. // // Don't generate, rather use C++ code.
address generate_verify_oop() { // this is actually a `FunctionDescriptor*'.
address start = 0;
// -XX:+OptimizeFill : convert fill/copy loops into intrinsic // // The code is implemented(ported from sparc) as we believe it benefits JVM98, however // tracing(-XX:+TraceOptimizeFill) shows the intrinsic replacement doesn't happen at all! // // Source code in function is_range_check_if() shows that OptimizeFill relaxed the condition // for turning on loop predication optimization, and hence the behavior of "array range check" // and "loop invariant check" could be influenced, which potentially boosted JVM98. // // Generate stub for disjoint short fill. If "aligned" is true, the // "to" address is assumed to be heapword aligned. // // Arguments for generated stub: // to: R3_ARG1 // value: R4_ARG2 // count: R5_ARG3 treated as signed //
address generate_fill(BasicType t, bool aligned, constchar* name) {
StubCodeMark mark(this, "StubRoutines", name);
address start = __ function_entry();
constRegister to = R3_ARG1; // source array address constRegister value = R4_ARG2; // fill value constRegister count = R5_ARG3; // elements count constRegister temp = R6_ARG4; // temp register
//assert_clean_int(count, O3); // Make sure 'count' is clean int.
int shift = -1; switch (t) { case T_BYTE:
shift = 2; // Clone bytes (zero extend not needed because store instructions below ignore high order bytes).
__ rldimi(value, value, 8, 48); // 8 bit -> 16 bit
__ cmpdi(CCR0, count, 2<<shift); // Short arrays (< 8 bytes) fill by element.
__ blt(CCR0, L_fill_elements);
__ rldimi(value, value, 16, 32); // 16 bit -> 32 bit break; case T_SHORT:
shift = 1; // Clone bytes (zero extend not needed because store instructions below ignore high order bytes).
__ rldimi(value, value, 16, 32); // 16 bit -> 32 bit
__ cmpdi(CCR0, count, 2<<shift); // Short arrays (< 8 bytes) fill by element.
__ blt(CCR0, L_fill_elements); break; case T_INT:
shift = 0;
__ cmpdi(CCR0, count, 2<<shift); // Short arrays (< 8 bytes) fill by element.
__ blt(CCR0, L_fill_4_bytes); break; default: ShouldNotReachHere();
}
if (!aligned && (t == T_BYTE || t == T_SHORT)) { // Align source address at 4 bytes address boundary. if (t == T_BYTE) { // One byte misalignment happens only for byte arrays.
__ andi_(temp, to, 1);
__ beq(CCR0, L_skip_align1);
__ stb(value, 0, to);
__ addi(to, to, 1);
__ addi(count, count, -1);
__ bind(L_skip_align1);
} // Two bytes misalignment happens only for byte and short (char) arrays.
__ andi_(temp, to, 2);
__ beq(CCR0, L_skip_align2);
__ sth(value, 0, to);
__ addi(to, to, 2);
__ addi(count, count, -(1 << (shift - 1)));
__ bind(L_skip_align2);
}
if (!aligned) { // Align to 8 bytes, we know we are 4 byte aligned to start.
__ andi_(temp, to, 7);
__ beq(CCR0, L_fill_32_bytes);
__ stw(value, 0, to);
__ addi(to, to, 4);
__ addi(count, count, -(1 << shift));
__ bind(L_fill_32_bytes);
}
__ li(temp, 8<<shift); // Prepare for 32 byte loop. // Clone bytes int->long as above.
__ rldimi(value, value, 32, 0); // 32 bit -> 64 bit
// // Length is too short, just fill 8 bytes at a time. //
Label L_fill_8_bytes_loop;
__ bind(L_fill_8_bytes_loop);
__ std(value, 0, to);
__ addic_(count, count, -(2 << shift));
__ addi(to, to, 8);
__ bge(CCR0, L_fill_8_bytes_loop);
// Generate overlap test for array copy stubs. // // Input: // R3_ARG1 - from // R4_ARG2 - to // R5_ARG3 - element count // void array_overlap_test(address no_overlap_target, int log2_elem_size) { Register tmp1 = R6_ARG4; Register tmp2 = R7_ARG5;
assert_positive_int(R5_ARG3);
__ subf(tmp1, R3_ARG1, R4_ARG2); // distance in bytes
__ sldi(tmp2, R5_ARG3, log2_elem_size); // size in bytes
__ cmpld(CCR0, R3_ARG1, R4_ARG2); // Use unsigned comparison!
__ cmpld(CCR1, tmp1, tmp2);
__ crnand(CCR0, Assembler::less, CCR1, Assembler::less); // Overlaps if Src before dst and distance smaller than size. // Branch to forward copy routine otherwise (within range of 32kB).
__ bc(Assembler::bcondCRbiIs1, Assembler::bi0(CCR0, Assembler::less), no_overlap_target);
// need to copy backwards
}
// This is common errorexit stub for UnsafeCopyMemory.
address generate_unsafecopy_common_error_exit() {
  // Emits the shared error exit for UnsafeCopyMemory and returns its entry
  // address. The generated code restores DSCR where the CPU has it, then
  // returns 0 in R3_RET.
  address start_pc = __ pc();
  Register tmp1 = R6_ARG4;
  // probably copy stub would have changed value reset it.
  // (The copy loops in this file program DSCR with `_dscr_val | 7`; this
  // writes back the plain `_dscr_val`. The write is only emitted when the
  // CPU supports the register.)
  if (VM_Version::has_mfdscr()) {
    __ load_const_optimized(tmp1, VM_Version::_dscr_val);
    __ mtdscr(tmp1);
  }
  __ li(R3_RET, 0); // return 0
  __ blr();
  return start_pc;
}
// The guideline in the implementations of generate_disjoint_xxx_copy // (xxx=byte,short,int,long,oop) is to copy as many elements as possible with // single instructions, but to avoid alignment interrupts (see subsequent // comment). Furthermore, we try to minimize misaligned access, even // though they cause no alignment interrupt. // // In Big-Endian mode, the PowerPC architecture requires implementations to // handle automatically misaligned integer halfword and word accesses, // word-aligned integer doubleword accesses, and word-aligned floating-point // accesses. Other accesses may or may not generate an Alignment interrupt // depending on the implementation. // Alignment interrupt handling may require on the order of hundreds of cycles, // so every effort should be made to avoid misaligned memory values. // // // Generate stub for disjoint byte copy. If "aligned" is true, the // "from" and "to" addresses are assumed to be heapword aligned. // // Arguments for generated stub: // from: R3_ARG1 // to: R4_ARG2 // count: R5_ARG3 treated as signed //
address generate_disjoint_byte_copy(bool aligned, constchar * name) {
StubCodeMark mark(this, "StubRoutines", name);
address start = __ function_entry();
assert_positive_int(R5_ARG3);
// Don't try anything fancy if arrays don't have many elements.
__ li(tmp3, 0);
__ cmpwi(CCR0, R5_ARG3, 17);
__ ble(CCR0, l_6); // copy 4 at a time
if (!aligned) {
__ xorr(tmp1, R3_ARG1, R4_ARG2);
__ andi_(tmp1, tmp1, 3);
__ bne(CCR0, l_6); // If arrays don't have the same alignment mod 4, do 4 element copy.
// Copy elements if necessary to align to 4 bytes.
__ neg(tmp1, R3_ARG1); // Compute distance to alignment boundary.
__ andi_(tmp1, tmp1, 3);
__ beq(CCR0, l_2);
// copy 8 elements at a time
__ xorr(tmp2, R3_ARG1, R4_ARG2); // skip if src & dest have differing alignment mod 8
__ andi_(tmp1, tmp2, 7);
__ bne(CCR0, l_7); // not same alignment -> to or from is aligned -> copy 8
// copy a 2-element word if necessary to align to 8 bytes
__ andi_(R0, R3_ARG1, 7);
__ beq(CCR0, l_7);
__ bind(l_8); // Use unrolled version for mass copying (copy 32 elements a time) // Load feeding store gets zero latency on Power6, however not on Power5. // Therefore, the following sequence is made for the good of both.
__ ld(tmp1, 0, R3_ARG1);
__ ld(tmp2, 8, R3_ARG1);
__ ld(tmp3, 16, R3_ARG1);
__ ld(tmp4, 24, R3_ARG1);
__ std(tmp1, 0, R4_ARG2);
__ std(tmp2, 8, R4_ARG2);
__ std(tmp3, 16, R4_ARG2);
__ std(tmp4, 24, R4_ARG2);
__ addi(R3_ARG1, R3_ARG1, 32);
__ addi(R4_ARG2, R4_ARG2, 32);
__ bdnz(l_8);
} else { // Processor supports VSX, so use it to mass copy.
// Prefetch the data into the L2 cache.
__ dcbt(R3_ARG1, 0);
// If supported set DSCR pre-fetch to deepest. if (VM_Version::has_mfdscr()) {
__ load_const_optimized(tmp2, VM_Version::_dscr_val | 7);
__ mtdscr(tmp2);
}
__ li(tmp1, 16);
// Backbranch target aligned to 32-byte. Not 16-byte align as // loop contains < 8 instructions that fit inside a single // i-cache sector.
__ align(32);
__ bind(l_10); // Use loop with VSX load/store instructions to // copy 32 elements a time.
__ lxvd2x(tmp_vsr1, R3_ARG1); // Load src
__ stxvd2x(tmp_vsr1, R4_ARG2); // Store to dst
__ lxvd2x(tmp_vsr2, tmp1, R3_ARG1); // Load src + 16
__ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst + 16
__ addi(R3_ARG1, R3_ARG1, 32); // Update src+=32
__ addi(R4_ARG2, R4_ARG2, 32); // Update dsc+=32
__ bdnz(l_10); // Dec CTR and loop if not zero.
// Generate stub for conjoint byte copy. If "aligned" is true, the // "from" and "to" addresses are assumed to be heapword aligned. // // Arguments for generated stub: // from: R3_ARG1 // to: R4_ARG2 // count: R5_ARG3 treated as signed //
address generate_conjoint_byte_copy(bool aligned, const char * name) {
  // Emits the conjoint (possibly overlapping) byte copy stub and returns
  // its entry address.
  //
  // aligned: negated and passed to UnsafeCopyMemoryMark.
  // name:    stub name handed to StubCodeMark.
  //
  // NOTE(review): `tmp1` and `nooverlap_target` are used below but are not
  // declared in the visible block; presumably their definitions sit between
  // the assert and the overlap test -- confirm against the complete file.
  StubCodeMark mark(this, "StubRoutines", name);
  address start = __ function_entry();
  assert_positive_int(R5_ARG3);

  // Branches to nooverlap_target when a forward copy can be used; otherwise
  // execution continues with the backward loop below.
  array_overlap_test(nooverlap_target, 0);
  // Do reverse copy. We assume the case of actual overlap is rare enough
  // that we don't have to optimize it.
  Label l_1, l_2;
  {
    // UnsafeCopyMemory page error: continue at UnsafeCopyMemory common_error_exit
    UnsafeCopyMemoryMark ucmm(this, !aligned, false);
    // Enter at l_2 so the index is decremented and the byte loaded before
    // the first store.
    __ b(l_2);
    __ bind(l_1);
    __ stbx(tmp1, R4_ARG2, R5_ARG3);
    __ bind(l_2);
    // R5_ARG3 serves as both the remaining count and the byte index.
    __ addic_(R5_ARG3, R5_ARG3, -1);
    // The load is issued before the loop-exit test, so it also executes
    // once for the final, negative index.
    __ lbzx(tmp1, R3_ARG1, R5_ARG3);
    __ bge(CCR0, l_1);
  }
  __ li(R3_RET, 0); // return 0
  __ blr();

  return start;
}
// Generate stub for disjoint short copy. If "aligned" is true, the // "from" and "to" addresses are assumed to be heapword aligned. // // Arguments for generated stub: // from: R3_ARG1 // to: R4_ARG2 // elm.count: R5_ARG3 treated as signed // // Strategy for aligned==true: // // If length <= 9: // 1. copy 2 elements at a time (l_6) // 2. copy last element if original element count was odd (l_1) // // If length > 9: // 1. copy 4 elements at a time until less than 4 elements are left (l_7) // 2. copy 2 elements at a time until less than 2 elements are left (l_6) // 3. copy last element if one was left in step 2. (l_1) // // // Strategy for aligned==false: // // If length <= 9: same as aligned==true case, but NOTE: load/stores // can be unaligned (see comment below) // // If length > 9: // 1. continue with step 6. if the alignment of from and to mod 4 // is different. // 2. align from and to to 4 bytes by copying 1 element if necessary // 3. at l_2 from and to are 4 byte aligned; continue with // 5. if they cannot be aligned to 8 bytes because they have // got different alignment mod 8. // 4. at this point we know that both, from and to, have the same // alignment mod 8, now copy one element if necessary to get // 8 byte alignment of from and to. // 5. copy 4 elements at a time until less than 4 elements are // left; depending on step 3. all load/stores are aligned or // either all loads or all stores are unaligned. // 6. copy 2 elements at a time until less than 2 elements are // left (l_6); arriving here from step 1., there is a chance // that all accesses are unaligned. // 7. copy last element if one was left in step 6. (l_1) // // There are unaligned data accesses using integer load/store // instructions in this stub. POWER allows such accesses. // // According to the manuals (PowerISA_V2.06_PUBLIC, Book II, // Chapter 2: Effect of Operand Placement on Performance) unaligned // integer load/stores have good performance. 
Only unaligned // floating point load/stores can have poor performance. // // TODO: // // 1. check if aligning the backbranch target of loops is beneficial //
address generate_disjoint_short_copy(bool aligned, constchar * name) {
StubCodeMark mark(this, "StubRoutines", name);
Label l_1, l_2, l_3, l_4, l_5, l_6, l_7, l_8, l_9;
{ // UnsafeCopyMemory page error: continue at UnsafeCopyMemory common_error_exit
UnsafeCopyMemoryMark ucmm(this, !aligned, false); // don't try anything fancy if arrays don't have many elements
__ li(tmp3, 0);
__ cmpwi(CCR0, R5_ARG3, 9);
__ ble(CCR0, l_6); // copy 2 at a time
if (!aligned) {
__ xorr(tmp1, R3_ARG1, R4_ARG2);
__ andi_(tmp1, tmp1, 3);
__ bne(CCR0, l_6); // if arrays don't have the same alignment mod 4, do 2 element copy
// At this point it is guaranteed that both, from and to have the same alignment mod 4.
// Copy 1 element if necessary to align to 4 bytes.
__ andi_(tmp1, R3_ARG1, 3);
__ beq(CCR0, l_2);
// At this point the positions of both, from and to, are at least 4 byte aligned.
// Copy 4 elements at a time. // Align to 8 bytes, but only if both, from and to, have same alignment mod 8.
__ xorr(tmp2, R3_ARG1, R4_ARG2);
__ andi_(tmp1, tmp2, 7);
__ bne(CCR0, l_7); // not same alignment mod 8 -> copy 4, either from or to will be unaligned
// Copy a 2-element word if necessary to align to 8 bytes.
__ andi_(R0, R3_ARG1, 7);
__ beq(CCR0, l_7);
__ bind(l_8); // Use unrolled version for mass copying (copy 16 elements a time). // Load feeding store gets zero latency on Power6, however not on Power5. // Therefore, the following sequence is made for the good of both.
__ ld(tmp1, 0, R3_ARG1);
__ ld(tmp2, 8, R3_ARG1);
__ ld(tmp3, 16, R3_ARG1);
__ ld(tmp4, 24, R3_ARG1);
__ std(tmp1, 0, R4_ARG2);
__ std(tmp2, 8, R4_ARG2);
__ std(tmp3, 16, R4_ARG2);
__ std(tmp4, 24, R4_ARG2);
__ addi(R3_ARG1, R3_ARG1, 32);
__ addi(R4_ARG2, R4_ARG2, 32);
__ bdnz(l_8);
} else { // Processor supports VSX, so use it to mass copy.
// Prefetch src data into L2 cache.
__ dcbt(R3_ARG1, 0);
// If supported set DSCR pre-fetch to deepest. if (VM_Version::has_mfdscr()) {
__ load_const_optimized(tmp2, VM_Version::_dscr_val | 7);
__ mtdscr(tmp2);
}
__ li(tmp1, 16);
// Backbranch target aligned to 32-byte. It's not aligned 16-byte // as loop contains < 8 instructions that fit inside a single // i-cache sector.
__ align(32);
__ bind(l_9); // Use loop with VSX load/store instructions to // copy 16 elements a time.
__ lxvd2x(tmp_vsr1, R3_ARG1); // Load from src.
__ stxvd2x(tmp_vsr1, R4_ARG2); // Store to dst.
__ lxvd2x(tmp_vsr2, R3_ARG1, tmp1); // Load from src + 16.
__ stxvd2x(tmp_vsr2, R4_ARG2, tmp1); // Store to dst + 16.
__ addi(R3_ARG1, R3_ARG1, 32); // Update src+=32.
__ addi(R4_ARG2, R4_ARG2, 32); // Update dsc+=32.
__ bdnz(l_9); // Dec CTR and loop if not zero.
// Generate stub for conjoint short copy. If "aligned" is true, the // "from" and "to" addresses are assumed to be heapword aligned. // // Arguments for generated stub: // from: R3_ARG1 // to: R4_ARG2 // count: R5_ARG3 treated as signed //
address generate_conjoint_short_copy(bool aligned, constchar * name) {
StubCodeMark mark(this, "StubRoutines", name);
address start = __ function_entry();
assert_positive_int(R5_ARG3);
// Generate core code for disjoint int copy (and oop copy on 32-bit). If "aligned" // is true, the "from" and "to" addresses are assumed to be heapword aligned. // // Arguments: // from: R3_ARG1 // to: R4_ARG2 // count: R5_ARG3 treated as signed // void generate_disjoint_int_copy_core(bool aligned) { Register tmp1 = R6_ARG4; Register tmp2 = R7_ARG5; Register tmp3 = R8_ARG6; Register tmp4 = R0;
// for short arrays, just do single element copy
__ li(tmp3, 0);
__ cmpwi(CCR0, R5_ARG3, 5);
__ ble(CCR0, l_2);
if (!aligned) { // check if arrays have same alignment mod 8.
__ xorr(tmp1, R3_ARG1, R4_ARG2);
__ andi_(R0, tmp1, 7); // Not the same alignment, but ld and std just need to be 4 byte aligned.
__ bne(CCR0, l_4); // to OR from is 8 byte aligned -> copy 2 at a time
// copy 1 element to align to and from on an 8 byte boundary
__ andi_(R0, R3_ARG1, 7);
__ beq(CCR0, l_4);
__ bind(l_6); // Use unrolled version for mass copying (copy 8 elements a time). // Load feeding store gets zero latency on power6, however not on power 5. // Therefore, the following sequence is made for the good of both.
__ ld(tmp1, 0, R3_ARG1);
__ ld(tmp2, 8, R3_ARG1);
__ ld(tmp3, 16, R3_ARG1);
__ ld(tmp4, 24, R3_ARG1);
__ std(tmp1, 0, R4_ARG2);
__ std(tmp2, 8, R4_ARG2);
__ std(tmp3, 16, R4_ARG2);
__ std(tmp4, 24, R4_ARG2);
__ addi(R3_ARG1, R3_ARG1, 32);
__ addi(R4_ARG2, R4_ARG2, 32);
__ bdnz(l_6);
} else { // Processor supports VSX, so use it to mass copy.
// Prefetch the data into the L2 cache.
__ dcbt(R3_ARG1, 0);
// If supported set DSCR pre-fetch to deepest. if (VM_Version::has_mfdscr()) {
__ load_const_optimized(tmp2, VM_Version::_dscr_val | 7);
__ mtdscr(tmp2);
}
__ li(tmp1, 16);
// Backbranch target aligned to 32-byte. Not 16-byte align as // loop contains < 8 instructions that fit inside a single // i-cache sector.
__ align(32);
__ bind(l_7); // Use loop with VSX load/store instructions to // copy 8 elements a time.
__ lxvd2x(tmp_vsr1, R3_ARG1); // Load src
__ stxvd2x(tmp_vsr1, R4_ARG2); // Store to dst
__ lxvd2x(tmp_vsr2, tmp1, R3_ARG1); // Load src + 16
__ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst + 16
__ addi(R3_ARG1, R3_ARG1, 32); // Update src+=32
__ addi(R4_ARG2, R4_ARG2, 32); // Update dsc+=32
__ bdnz(l_7); // Dec CTR and loop if not zero.
// Generate stub for disjoint int copy. If "aligned" is true, the // "from" and "to" addresses are assumed to be heapword aligned. // // Arguments for generated stub: // from: R3_ARG1 // to: R4_ARG2 // count: R5_ARG3 treated as signed //
address generate_disjoint_int_copy(bool aligned, const char * name) {
  // Emits the disjoint int copy stub and returns its entry address.
  //
  // aligned: forwarded to generate_disjoint_int_copy_core(); its negation
  //          is passed to UnsafeCopyMemoryMark.
  // name:    stub name handed to StubCodeMark.
  //
  // The generated stub returns 0 in R3_RET.
  StubCodeMark mark(this, "StubRoutines", name);
  address start = __ function_entry();
  assert_positive_int(R5_ARG3);
  {
    // UnsafeCopyMemory page error: continue at UnsafeCopyMemory common_error_exit
    UnsafeCopyMemoryMark ucmm(this, !aligned, false);
    generate_disjoint_int_copy_core(aligned);
  }
  __ li(R3_RET, 0); // return 0
  __ blr();
  return start;
}
// Generate core code for conjoint int copy (and oop copy on // 32-bit). If "aligned" is true, the "from" and "to" addresses // are assumed to be heapword aligned. // // Arguments: // from: R3_ARG1 // to: R4_ARG2 // count: R5_ARG3 treated as signed // void generate_conjoint_int_copy_core(bool aligned) { // Do reverse copy. We assume the case of actual overlap is rare enough // that we don't have to optimize it.
if (!aligned) { // check if arrays have same alignment mod 8.
__ xorr(tmp1, R3_ARG1, R4_ARG2);
__ andi_(R0, tmp1, 7); // Not the same alignment, but ld and std just need to be 4 byte aligned.
__ bne(CCR0, l_7); // to OR from is 8 byte aligned -> copy 2 at a time
// copy 1 element to align to and from on an 8 byte boundary
__ andi_(R0, R3_ARG1, 7);
__ beq(CCR0, l_7);
if (!VM_Version::has_vsx()) {
__ bind(l_4); // Use unrolled version for mass copying (copy 4 elements a time). // Load feeding store gets zero latency on Power6, however not on Power5. // Therefore, the following sequence is made for the good of both.
__ addi(R3_ARG1, R3_ARG1, -32);
__ addi(R4_ARG2, R4_ARG2, -32);
__ ld(tmp4, 24, R3_ARG1);
__ ld(tmp3, 16, R3_ARG1);
__ ld(tmp2, 8, R3_ARG1);
__ ld(tmp1, 0, R3_ARG1);
__ std(tmp4, 24, R4_ARG2);
__ std(tmp3, 16, R4_ARG2);
__ std(tmp2, 8, R4_ARG2);
__ std(tmp1, 0, R4_ARG2);
__ bdnz(l_4);
} else { // Processor supports VSX, so use it to mass copy. // Prefetch the data into the L2 cache.
__ dcbt(R3_ARG1, 0);
// If supported set DSCR pre-fetch to deepest. if (VM_Version::has_mfdscr()) {
__ load_const_optimized(tmp2, VM_Version::_dscr_val | 7);
__ mtdscr(tmp2);
}
__ li(tmp1, 16);
// Backbranch target aligned to 32-byte. Not 16-byte align as // loop contains < 8 instructions that fit inside a single // i-cache sector.
__ align(32);
__ bind(l_4); // Use loop with VSX load/store instructions to // copy 8 elements a time.
__ addi(R3_ARG1, R3_ARG1, -32); // Update src-=32
__ addi(R4_ARG2, R4_ARG2, -32); // Update dsc-=32
__ lxvd2x(tmp_vsr2, tmp1, R3_ARG1); // Load src+16
__ lxvd2x(tmp_vsr1, R3_ARG1); // Load src
__ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst+16
__ stxvd2x(tmp_vsr1, R4_ARG2); // Store to dst
__ bdnz(l_4);
if (!VM_Version::has_vsx()) {
__ bind(l_4); // Use unrolled version for mass copying (copy 4 elements a time). // Load feeding store gets zero latency on Power6, however not on Power5. // Therefore, the following sequence is made for the good of both.
__ ld(tmp1, 0, R3_ARG1);
__ ld(tmp2, 8, R3_ARG1);
__ ld(tmp3, 16, R3_ARG1);
__ ld(tmp4, 24, R3_ARG1);
__ std(tmp1, 0, R4_ARG2);
__ std(tmp2, 8, R4_ARG2);
__ std(tmp3, 16, R4_ARG2);
__ std(tmp4, 24, R4_ARG2);
__ addi(R3_ARG1, R3_ARG1, 32);
__ addi(R4_ARG2, R4_ARG2, 32);
__ bdnz(l_4);
} else { // Processor supports VSX, so use it to mass copy.
// Prefetch the data into the L2 cache.
__ dcbt(R3_ARG1, 0);
// If supported set DSCR pre-fetch to deepest. if (VM_Version::has_mfdscr()) {
__ load_const_optimized(tmp2, VM_Version::_dscr_val | 7);
__ mtdscr(tmp2);
}
__ li(tmp1, 16);
// Backbranch target aligned to 32-byte. Not 16-byte align as // loop contains < 8 instructions that fit inside a single // i-cache sector.
__ align(32);
__ bind(l_5); // Use loop with VSX load/store instructions to // copy 4 elements a time.
__ lxvd2x(tmp_vsr1, R3_ARG1); // Load src
__ stxvd2x(tmp_vsr1, R4_ARG2); // Store to dst
__ lxvd2x(tmp_vsr2, tmp1, R3_ARG1); // Load src + 16
__ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst + 16
__ addi(R3_ARG1, R3_ARG1, 32); // Update src+=32
__ addi(R4_ARG2, R4_ARG2, 32); // Update dsc+=32
__ bdnz(l_5); // Dec CTR and loop if not zero.
if (!VM_Version::has_vsx()) {
__ bind(l_4); // Use unrolled version for mass copying (copy 4 elements a time). // Load feeding store gets zero latency on Power6, however not on Power5. // Therefore, the following sequence is made for the good of both.
__ addi(R3_ARG1, R3_ARG1, -32);
--> --------------------
--> maximum size reached
--> --------------------
Messung V0.5
¤ Dauer der Verarbeitung: 0.30 Sekunden
(vorverarbeitet)
¤
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.