/*
 * Copyright (c) 2016, 2020, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2016, 2020 SAP SE. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */

#ifndef CPU_S390_COPY_S390_HPP
#define CPU_S390_COPY_S390_HPP
// HeapWordSize (the size of class HeapWord) is 8 bytes (the size of a
// pointer variable), since we always run the _LP64 model. As a consequence,
// HeapWord* memory ranges are always assumed to be doubleword-aligned,
// having a size which is an integer multiple of HeapWordSize.
//
// Dealing only with doubleword-aligned doubleword units has important
// positive performance and data access consequences. Many of the move
// instructions perform particularly well under these circumstances.
// Data access is "doubleword-concurrent", except for MVC and XC.
// Furthermore, data access can be forced to be sequential (MVCL and MVCLE)
// by use of the special padding byte 0xb1, where required. For copying,
// we use padding byte 0xb0 to prevent the D-cache from being polluted.
//
// On z/Architecture, gcc optimizes memcpy into a series of MVC instructions.
// This is optimal, even if just one HeapWord is copied. However, MVC
// copying is not atomic, i.e. not "doubleword concurrent" by definition.
//
// If the -mmvcle compiler option is specified, memcpy translates into
// code such that the entire memory range is copied or preset with just
// one MVCLE instruction.
//
// *to = *from is transformed into a MVC instruction already with -O1.
// Thus, for atomic copy operations, (inline) assembler code is required
// to guarantee atomic data accesses.
//
// For large (len >= MVCLEThreshold) chunks of memory, we exploit
// special H/W support of z/Architecture:
// 1) copy short piece of memory to page-align address(es)
// 2) copy largest part (all contained full pages) of memory using mvcle instruction.
//    z/Architecture processors have special H/W support for page-aligned storage
//    where len is an int multiple of page size. In that case, up to 4 cache lines are
//    processed in parallel and L1 cache is not polluted.
// 3) copy the remaining piece of memory.
//
// Measurement classifications:
// very rare  - <=     10,000 calls AND <=     1,000 usec elapsed
// rare       - <=    100,000 calls AND <=    10,000 usec elapsed
// some       - <=  1,000,000 calls AND <=   100,000 usec elapsed
// freq       - <= 10,000,000 calls AND <= 1,000,000 usec elapsed
// very freq  - >  10,000,000 calls OR  >  1,000,000 usec elapsed
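
// To make the atomicity remark above concrete (hypothetical example, not part
// of the original code): reading the source through a volatile lvalue keeps
// gcc from fusing the assignment into a (non-atomic) MVC and yields a single
// doubleword load and store instead. This file opts for inline asm rather
// than volatile accesses, as explained above.
static void copy_one_jlong_atomic_example(const jlong* from, jlong* to) {
  *to = *(const volatile jlong*)from; // One 8-byte load, then one 8-byte store.
}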
#undef USE_INLINE_ASM
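
// Two helpers used throughout this file do not appear in this excerpt.
// Minimal sketches follow (hedged reconstructions, not the original
// definitions): has_destructive_overlap must return true iff the source
// starts before the destination and the destination begins inside the
// source range; pd_conjoint_bytes must perform a plain, possibly
// overlapping byte copy, which memmove (assumed available, like the
// memset used further below) provides.
static bool has_destructive_overlap(const char* from, const char* to, size_t byte_count) {
  return (from < to) && ((size_t)(to - from) < byte_count);
}

static void pd_conjoint_bytes(const void* from, void* to, size_t count) {
  (void)memmove(to, from, count); // Overlap-safe byte copy.
}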
static void copy_conjoint_jshorts_atomic(const jshort* from, jshort* to, size_t count) {
  if (from > to) {
    while (count-- > 0) { // Copy forwards
      *to++ = *from++;
    }
  } else {
    from += count - 1;
    to   += count - 1;
    while (count-- > 0) { // Copy backwards
      *to-- = *from--;
    }
  }
}
static void copy_conjoint_jints_atomic(const jint* from, jint* to, size_t count) {
  if (from > to) {
    while (count-- > 0) { // Copy forwards
      *to++ = *from++;
    }
  } else {
    from += count - 1;
    to   += count - 1;
    while (count-- > 0) { // Copy backwards
      *to-- = *from--;
    }
  }
}
#ifdef USE_INLINE_ASM

// Optimized copying for data less than 4k
// - no destructive overlap
// - 0 <= _n_bytes <= 4096
// This macro needs to be gcc-compiled with -march=z990. Otherwise, the
// LAY instruction is not available.
#define MVC_MULTI(_to,_from,_n_bytes) \
{ unsigned long toaddr;    \
  unsigned long fromaddr;  \
  unsigned long movetable; \
  unsigned long len;       \
  asm("\t" \
    "   LTGFR  %[len],%[nby]          \n\t" /* copy count, set cc     */ \
    "   LG     %[ta],%[to]            \n\t" /* address of to area     */ \
    "   BRC    8,1f                   \n\t" /* nothing to copy        */ \
    \
    "   NILL   %[nby],255             \n\t" /* # bytes mod 256        */ \
    "   LG     %[fa],%[from]          \n\t" /* address of from area   */ \
    "   BRC    8,3f                   \n\t" /* no rest, skip copying  */ \
    \
    "   LARL   %[mta],2f              \n\t" /* MVC template addr      */ \
    "   AHI    %[nby],-1              \n\t" /* adjust for EX MVC      */ \
    \
    "   EX     %[nby],0(%[mta])       \n\t" /* only rightmost         */ \
                                            /* 8 bits of nby used     */ \
    /* Since nby is <= 4096 on entry to this code, we need       */ \
    /* no zero extension before using it in addr calc.           */ \
    "   LA     %[fa],1(%[nby],%[fa])  \n\t" /* adjust from addr       */ \
    "   LA     %[ta],1(%[nby],%[ta])  \n\t" /* adjust to addr         */ \
    \
    "3: SRAG   %[nby],%[len],8        \n\t" /* # cache lines          */ \
    "   LARL   %[mta],1f              \n\t" /* MVC table begin        */ \
    "   BRC    8,1f                   \n\t" /* nothing to copy        */ \
    \
    /* Insert ASSERT code here if required. */ \
    \
    "   LNGFR  %[nby],%[nby]          \n\t" /* negative offset into      */ \
    "   SLLG   %[nby],%[nby],4        \n\t" /* MVC table, 16-byte blocks */ \
    "   BC     15,0(%[nby],%[mta])    \n\t" /* branch to block #ncl      */ \
    \
    "2: MVC    0(1,%[ta]),0(%[fa])    \n\t" /* MVC template           */ \
    \
    "4: MVC    0(256,%[ta]),0(%[fa])  \n\t" /* 4096 == l              */ \
    "   LAY    %[ta],256(0,%[ta])     \n\t" \
    "   LA     %[fa],256(0,%[fa])     \n\t" \
    "5: MVC    0(256,%[ta]),0(%[fa])  \n\t" /* 3840 <= l < 4096       */ \
    "   LAY    %[ta],256(0,%[ta])     \n\t" \
    "   LA     %[fa],256(0,%[fa])     \n\t" \
    "   MVC    0(256,%[ta]),0(%[fa])  \n\t" /* 3584 <= l < 3840       */ \
    "   LAY    %[ta],256(0,%[ta])     \n\t" \
    "   LA     %[fa],256(0,%[fa])     \n\t" \
    "   MVC    0(256,%[ta]),0(%[fa])  \n\t" /* 3328 <= l < 3584       */ \
    "   LAY    %[ta],256(0,%[ta])     \n\t" \
    "   LA     %[fa],256(0,%[fa])     \n\t" \
    "   MVC    0(256,%[ta]),0(%[fa])  \n\t" /* 3072 <= l < 3328       */ \
    "   LAY    %[ta],256(0,%[ta])     \n\t" \
    "   LA     %[fa],256(0,%[fa])     \n\t" \
    "   MVC    0(256,%[ta]),0(%[fa])  \n\t" /* 2816 <= l < 3072       */ \
    "   LAY    %[ta],256(0,%[ta])     \n\t" \
    "   LA     %[fa],256(0,%[fa])     \n\t" \
    "   MVC    0(256,%[ta]),0(%[fa])  \n\t" /* 2560 <= l < 2816       */ \
    "   LAY    %[ta],256(0,%[ta])     \n\t" \
    "   LA     %[fa],256(0,%[fa])     \n\t" \
    "   MVC    0(256,%[ta]),0(%[fa])  \n\t" /* 2304 <= l < 2560       */ \
    "   LAY    %[ta],256(0,%[ta])     \n\t" \
    "   LA     %[fa],256(0,%[fa])     \n\t" \
    "   MVC    0(256,%[ta]),0(%[fa])  \n\t" /* 2048 <= l < 2304       */ \
    "   LAY    %[ta],256(0,%[ta])     \n\t" \
    "   LA     %[fa],256(0,%[fa])     \n\t" \
    "   MVC    0(256,%[ta]),0(%[fa])  \n\t" /* 1792 <= l < 2048       */ \
    "   LAY    %[ta],256(0,%[ta])     \n\t" \
    "   LA     %[fa],256(0,%[fa])     \n\t" \
    "   MVC    0(256,%[ta]),0(%[fa])  \n\t" /* 1536 <= l < 1792       */ \
    "   LAY    %[ta],256(0,%[ta])     \n\t" \
    "   LA     %[fa],256(0,%[fa])     \n\t" \
    "   MVC    0(256,%[ta]),0(%[fa])  \n\t" /* 1280 <= l < 1536       */ \
    "   LAY    %[ta],256(0,%[ta])     \n\t" \
    "   LA     %[fa],256(0,%[fa])     \n\t" \
    "   MVC    0(256,%[ta]),0(%[fa])  \n\t" /* 1024 <= l < 1280       */ \
    "   LAY    %[ta],256(0,%[ta])     \n\t" \
    "   LA     %[fa],256(0,%[fa])     \n\t" \
    "   MVC    0(256,%[ta]),0(%[fa])  \n\t" /*  768 <= l < 1024       */ \
    "   LAY    %[ta],256(0,%[ta])     \n\t" \
    "   LA     %[fa],256(0,%[fa])     \n\t" \
    "   MVC    0(256,%[ta]),0(%[fa])  \n\t" /*  512 <= l <  768       */ \
    "   LAY    %[ta],256(0,%[ta])     \n\t" \
    "   LA     %[fa],256(0,%[fa])     \n\t" \
    "   MVC    0(256,%[ta]),0(%[fa])  \n\t" /*  256 <= l <  512       */ \
    "   LAY    %[ta],256(0,%[ta])     \n\t" \
    "   LA     %[fa],256(0,%[fa])     \n\t" \
    "1: BCR    0,0                    \n\t" /* nop as branch target   */ \
    : [to]   "+Q" (_to)       /* outputs   */ \
    , [from] "+Q" (_from)     \
    , [ta]   "=a" (toaddr)    \
    , [fa]   "=a" (fromaddr)  \
    , [mta]  "=a" (movetable) \
    , [nby]  "+a" (_n_bytes)  \
    , [len]  "=a" (len)       \
    :                         /* inputs    */ \
    : "cc"                    /* clobbered */ \
  ); \
}
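
// A hypothetical wrapper (illustration only, not from the original code)
// showing MVC_MULTI's calling convention: all three operands are updated by
// the asm ("+Q"/"+a" constraints), so they must be modifiable lvalues, and
// the caller must already guarantee 0 <= len_bytes <= 4096 and no
// destructive overlap.
static void mvc_multi_example(jbyte* to, const jbyte* from, size_t len_bytes) {
  MVC_MULTI(to, from, len_bytes)
}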
#define MVCLE_MEMCOPY(_to,_from,_len) \
  asm( \
    "   LG     0,%[to]       \n\t" /* address of to area   */ \
    "   LG     2,%[from]     \n\t" /* address of from area */ \
    "   LGR    1,%[len]      \n\t" /* len of to area       */ \
    "   LGR    3,%[len]      \n\t" /* len of from area     */ \
    "1: MVCLE  0,2,176       \n\t" /* copy storage, bypass cache (0xb0) */ \
    "   BRC    1,1b          \n\t" /* retry if interrupted */ \
    : [to]   "+Q" (_to)   /* outputs   */ \
    , [from] "+Q" (_from) \
    : [len]  "r"  (_len)  /* inputs    */ \
    : "cc", "r0", "r1", "r2", "r3" /* clobbered */ \
  );
#define MVCLE_MEMINIT(_to,_val,_len) \
  asm( \
    "   LG     0,%[to]       \n\t" /* address of to area   */ \
    "   LGR    1,%[len]      \n\t" /* len of to area       */ \
    "   XGR    3,3           \n\t" /* from area len = 0    */ \
    "1: MVCLE  0,2,0(%[val]) \n\t" /* init storage         */ \
    "   BRC    1,1b          \n\t" /* retry if interrupted */ \
    : [to]  "+Q" (_to)  /* outputs   */ \
    : [len] "r"  (_len) /* inputs    */ \
    , [val] "r"  (_val) \
    : "cc", "r0", "r1", "r3" /* clobbered */ \
  );

#define MVCLE_MEMZERO(_to,_len) \
  asm( \
    "   LG     0,%[to]       \n\t" /* address of to area   */ \
    "   LGR    1,%[len]      \n\t" /* len of to area       */ \
    "   XGR    3,3           \n\t" /* from area len = 0    */ \
    "1: MVCLE  0,2,0         \n\t" /* clear storage        */ \
    "   BRC    1,1b          \n\t" /* retry if interrupted */ \
    : [to]  "+Q" (_to)  /* outputs   */ \
    : [len] "r"  (_len) /* inputs    */ \
    : "cc", "r0", "r1", "r3" /* clobbered */ \
  );
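
// A hypothetical illustration (not from the original code) of driving the
// MVCLE macros: MVCLE sets cc=3 when it is interrupted before completion,
// which the "BRC 1,1b" above retries. The pointer arguments are "+Q"
// operands and must therefore be modifiable lvalues:
static void mvcle_copy_example(void* to, const void* from, size_t len_bytes) {
  jbyte* to_bytes   = (jbyte*)to;
  jbyte* from_bytes = (jbyte*)from; // C-style cast drops const, as in the callers below.
  MVCLE_MEMCOPY(to_bytes, from_bytes, len_bytes)
}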
// Clear a stretch of memory, 0 <= _len <= 256.
// There is no alignment prereq.
// There is no test for len out of range specified above.
#define XC_MEMZERO_256(_to,_len) \
{ unsigned long toaddr; \
  unsigned long tolen;  \
  unsigned long target; \
  asm("\t" \
    "   LTGR   %[tolen],%[len]        \n\t" /* copy len, set cc    */ \
    "   BRC    8,2f                   \n\t" /* do nothing for l=0  */ \
    "   AGHI   %[tolen],-1            \n\t" /* adjust for EX XC    */ \
    "   LARL   %[target],1f           \n\t" /* addr of XC instr    */ \
    "   LG     %[toaddr],%[to]        \n\t" /* addr of data area   */ \
    "   EX     %[tolen],0(%[target])  \n\t" /* execute XC instr    */ \
    "   BRC    15,2f                  \n\t" /* skip template       */ \
    "1: XC     0(1,%[toaddr]),0(%[toaddr]) \n\t" /* XC template    */ \
    "2: BCR    0,0                    \n\t" /* nop as branch target */ \
    : [to]     "+Q" (_to)    /* outputs   */ \
    , [tolen]  "=a" (tolen)  \
    , [toaddr] "=a" (toaddr) \
    , [target] "=a" (target) \
    : [len]    "r"  (_len)   /* inputs    */ \
    : "cc"                   /* clobbered */ \
  ); \
}
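
// Hypothetical usage sketch (not from the original code): the macro EXecutes
// the one-byte XC template with length code _len-1, so the caller must have
// established 0 <= len <= 256 beforehand; no range check happens inside.
static void xc_memzero_256_example(void* to, size_t len) {
  jbyte* to_bytes = (jbyte*)to;
  if (len <= 256) {
    XC_MEMZERO_256(to_bytes, len)
  }
}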
// Clear a stretch of memory, 256 < _len.
// XC_MEMZERO_256 may be used to clear shorter areas.
//
// The code
// - first zeroes a few bytes to align on a HeapWord.
//   This step is currently inactive because all calls seem
//   to have their data aligned on HeapWord boundaries.
// - then zeroes a few HeapWords to align on a cache line.
// - then zeroes entire cache lines in a loop.
// - then zeroes the remaining (partial) cache line.
#if 1
#define XC_MEMZERO_ANY(_to,_len) \
{ unsigned long toaddr; \
  unsigned long tolen;  \
  unsigned long len8;   \
  unsigned long len256; \
  unsigned long target; \
  unsigned long lenx;   \
  asm("\t" \
    "   LTGR   %[tolen],%[len]               \n\t" /* copy len, set cc     */ \
    "   BRC    8,2f                          \n\t" /* do nothing for l=0   */ \
    "   LG     %[toaddr],%[to]               \n\t" /* addr of data area    */ \
    "   LARL   %[target],1f                  \n\t" /* addr of XC instr     */ \
    " " \
    "   LCGR   %[len256],%[toaddr]           \n\t" /* cache line alignment */ \
    "   NILL   %[len256],0xff                \n\t" \
    "   BRC    8,4f                          \n\t" /* already aligned      */ \
    "   NILH   %[len256],0x00                \n\t" /* zero extend          */ \
    "   LLGFR  %[len256],%[len256]           \n\t" \
    "   LAY    %[lenx],-1(,%[len256])        \n\t" \
    "   EX     %[lenx],0(%[target])          \n\t" /* execute XC instr     */ \
    "   LA     %[toaddr],0(%[len256],%[toaddr]) \n\t" \
    "   SGR    %[tolen],%[len256]            \n\t" /* adjust len           */ \
    " " \
    "4: SRAG   %[lenx],%[tolen],8            \n\t" /* # cache lines        */ \
    "   BRC    8,6f                          \n\t" /* no full cache lines  */ \
    "5: XC     0(256,%[toaddr]),0(%[toaddr]) \n\t" \
    "   LA     %[toaddr],256(,%[toaddr])     \n\t" \
    "   BRCTG  %[lenx],5b                    \n\t" /* iterate              */ \
    " " \
    "6: NILL   %[tolen],0xff                 \n\t" /* leftover bytes       */ \
    "   BRC    8,2f                          \n\t" /* done if none         */ \
    "   LAY    %[lenx],-1(,%[tolen])         \n\t" \
    "   EX     %[lenx],0(%[target])          \n\t" /* execute XC instr     */ \
    "   BRC    15,2f                         \n\t" /* skip template        */ \
    " " \
    "1: XC     0(1,%[toaddr]),0(%[toaddr])   \n\t" /* XC template          */ \
    "2: BCR    0,0                           \n\t" /* nop as branch target */ \
    : [to]     "+Q" (_to)    /* outputs   */ \
    , [lenx]   "=a" (lenx)   \
    , [len256] "=a" (len256) \
    , [tolen]  "=a" (tolen)  \
    , [toaddr] "=a" (toaddr) \
    , [target] "=a" (target) \
    : [len]    "r"  (_len)   /* inputs    */ \
    : "cc"                   /* clobbered */ \
  ); \
}
#else
#define XC_MEMZERO_ANY(_to,_len) \
{ unsigned long toaddr; \
  unsigned long tolen;  \
  unsigned long len8;   \
  unsigned long len256; \
  unsigned long target; \
  unsigned long lenx;   \
  asm("\t" \
    "   LTGR   %[tolen],%[len]               \n\t" /* copy len, set cc     */ \
    "   BRC    8,2f                          \n\t" /* do nothing for l=0   */ \
    "   LG     %[toaddr],%[to]               \n\t" /* addr of data area    */ \
    "   LARL   %[target],1f                  \n\t" /* addr of XC instr     */ \
    " " \
    "   LCGR   %[len8],%[toaddr]             \n\t" /* HeapWord alignment   */ \
    "   NILL   %[len8],0x07                  \n\t" \
    "   BRC    8,3f                          \n\t" /* already aligned      */ \
    "   NILH   %[len8],0x00                  \n\t" /* zero extend          */ \
    "   LLGFR  %[len8],%[len8]               \n\t" \
    "   LAY    %[lenx],-1(,%[len8])          \n\t" \
    "   EX     %[lenx],0(%[target])          \n\t" /* execute XC instr     */ \
    "   LA     %[toaddr],0(%[len8],%[toaddr]) \n\t" \
    "   SGR    %[tolen],%[len8]              \n\t" /* adjust len           */ \
    " " \
    "3: LCGR   %[len256],%[toaddr]           \n\t" /* cache line alignment */ \
    "   NILL   %[len256],0xff                \n\t" \
    "   BRC    8,4f                          \n\t" /* already aligned      */ \
    "   NILH   %[len256],0x00                \n\t" /* zero extend          */ \
    "   LLGFR  %[len256],%[len256]           \n\t" \
    "   LAY    %[lenx],-1(,%[len256])        \n\t" \
    "   EX     %[lenx],0(%[target])          \n\t" /* execute XC instr     */ \
    "   LA     %[toaddr],0(%[len256],%[toaddr]) \n\t" \
    "   SGR    %[tolen],%[len256]            \n\t" /* adjust len           */ \
    " " \
    "4: SRAG   %[lenx],%[tolen],8            \n\t" /* # cache lines        */ \
    "   BRC    8,6f                          \n\t" /* no full cache lines  */ \
    "5: XC     0(256,%[toaddr]),0(%[toaddr]) \n\t" \
    "   LA     %[toaddr],256(,%[toaddr])     \n\t" \
    "   BRCTG  %[lenx],5b                    \n\t" /* iterate              */ \
    " " \
    "6: NILL   %[tolen],0xff                 \n\t" /* leftover bytes       */ \
    "   BRC    8,2f                          \n\t" /* done if none         */ \
    "   LAY    %[lenx],-1(,%[tolen])         \n\t" \
    "   EX     %[lenx],0(%[target])          \n\t" /* execute XC instr     */ \
    "   BRC    15,2f                         \n\t" /* skip template        */ \
    " " \
    "1: XC     0(1,%[toaddr]),0(%[toaddr])   \n\t" /* XC template          */ \
    "2: BCR    0,0                           \n\t" /* nop as branch target */ \
    : [to]     "+Q" (_to)    /* outputs   */ \
    , [lenx]   "=a" (lenx)   \
    , [len8]   "=a" (len8)   \
    , [len256] "=a" (len256) \
    , [tolen]  "=a" (tolen)  \
    , [toaddr] "=a" (toaddr) \
    , [target] "=a" (target) \
    : [len]    "r"  (_len)   /* inputs    */ \
    : "cc"                   /* clobbered */ \
  ); \
}
#endif

#endif // USE_INLINE_ASM
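
// For reference, a plain C++ sketch (hypothetical, not from the original
// code) of the phase structure XC_MEMZERO_ANY implements with XC: clear up
// to the next 256-byte cache line boundary, clear whole lines, then clear
// the partial tail. memset stands in for the EXecuted XC instructions.
static void xc_memzero_any_sketch(char* to, size_t len) {
  size_t head = (256 - ((unsigned long)to & 0xff)) & 0xff; // bytes to next line
  if (head > len) { head = len; }
  (void)memset(to, 0, head);       // phase 1: align to cache line
  to  += head;
  len -= head;
  size_t lines = len >> 8;
  (void)memset(to, 0, lines << 8); // phase 2: full 256-byte lines
  to += lines << 8;
  (void)memset(to, 0, len & 0xff); // phase 3: remaining tail
}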
//*************************************//
//   D I S J O I N T   C O P Y I N G   //
//*************************************//
static void pd_aligned_disjoint_words(const HeapWord* from, HeapWord* to, size_t count) {
  // JVM2008: very frequent, some tests frequent.

  // Copy HeapWord (=DW) aligned storage. Use MVCLE in inline-asm code.
  // MVCLE guarantees DW concurrent (i.e. atomic) accesses if both the addresses of the operands
  // are DW aligned and the length is an integer multiple of a DW. Should always be true here.
  //
  // No special exploit needed. H/W discovers suitable situations itself.
  //
  // For large chunks of memory, exploit special H/W support of z/Architecture:
  // 1) copy short piece of memory to page-align address(es)
  // 2) copy largest part (all contained full pages) of memory using mvcle instruction.
  //    z/Architecture processors have special H/W support for page-aligned storage
  //    where len is an int multiple of page size. In that case, up to 4 cache lines are
  //    processed in parallel and L1 cache is not polluted.
  // 3) copy the remaining piece of memory.
  //
#ifdef USE_INLINE_ASM
  jbyte* to_bytes   = (jbyte*)to;
  jbyte* from_bytes = (jbyte*)from;
  size_t len_bytes  = count*HeapWordSize;

  // Optimized copying for data less than 4k
  switch (count) {
    case 0: return;
    case 1: MOVE8_ATOMIC_1(to,from)
            return;
    case 2: MOVE8_ATOMIC_2(to,from)
            return;
    // case 3: MOVE8_ATOMIC_3(to,from)
    //         return;
    // case 4: MOVE8_ATOMIC_4(to,from)
    //         return;
    default:
      if (len_bytes <= 4096) {
        MVC_MULTI(to,from,len_bytes)
        return;
      }
      // else
      MVCLE_MEMCOPY(to_bytes, from_bytes, len_bytes)
      return;
  }
#else
  // Fallback code: plain forward copy, word by word (areas are disjoint).
  switch (count) {
    case 0: return;
    default:
      while (count-- > 0) {
        *to++ = *from++;
      }
      return;
  }
#endif
}
//***************************************************//
//   C O N J O I N T   A T O M I C   C O P Y I N G   //
//***************************************************//
static void pd_conjoint_bytes_atomic(const void* from, void* to, size_t count) {
  // Call arraycopy stubs to do the job.
pd_conjoint_bytes(from, to, count); // bytes are always accessed atomically.
}
static void pd_conjoint_jshorts_atomic(const jshort* from, jshort* to, size_t count) {
#ifdef USE_INLINE_ASM
  size_t count_in = count;
  if (has_destructive_overlap((const char*)from, (char*)to, count_in*BytesPerShort)) {
    // Use optimizations from shared code where no z-specific optimization exists.
    copy_conjoint_jshorts_atomic(from, to, count);
  } else {
    jbyte* to_bytes   = (jbyte*)to;
    jbyte* from_bytes = (jbyte*)from;
    size_t len_bytes  = count_in*BytesPerShort;
    MVCLE_MEMCOPY(to_bytes, from_bytes, len_bytes)
  }
#else
  // Use optimizations from shared code where no z-specific optimization exists.
  copy_conjoint_jshorts_atomic(from, to, count);
#endif
}
static void pd_conjoint_jints_atomic(const jint* from, jint* to, size_t count) {
#ifdef USE_INLINE_ASM
  size_t count_in = count;
  if (has_destructive_overlap((const char*)from, (char*)to, count_in*BytesPerInt)) {
    switch (count_in) {
      case 4: COPY4_ATOMIC_4(to,from)
              return;
      case 3: COPY4_ATOMIC_3(to,from)
              return;
      case 2: COPY4_ATOMIC_2(to,from)
              return;
      case 1: COPY4_ATOMIC_1(to,from)
              return;
      case 0: return;
      default:
        // Use optimizations from shared code where no z-specific optimization exists.
        copy_conjoint_jints_atomic(from, to, count_in);
        return;
    }
  }
  // else
  jbyte* to_bytes   = (jbyte*)to;
  jbyte* from_bytes = (jbyte*)from;
  size_t len_bytes  = count_in*BytesPerInt;
  MVCLE_MEMCOPY(to_bytes, from_bytes, len_bytes)
#else
  // Use optimizations from shared code where no z-specific optimization exists.
  copy_conjoint_jints_atomic(from, to, count);
#endif
}
static void pd_conjoint_jlongs_atomic(const jlong* from, jlong* to, size_t count) {
#ifdef USE_INLINE_ASM
  size_t count_in = count;
  if (has_destructive_overlap((char*)from, (char*)to, count_in*BytesPerLong)) {
    switch (count_in) {
      case 4: COPY8_ATOMIC_4(to,from)
              return;
      case 3: COPY8_ATOMIC_3(to,from)
              return;
      case 2: COPY8_ATOMIC_2(to,from)
              return;
      case 1: COPY8_ATOMIC_1(to,from)
              return;
      case 0: return;
      default:
        from += count_in;
        to   += count_in;
        while (count_in-- > 0) {
          *(--to) = *(--from); // Copy backwards, areas overlap destructively.
        }
        return;
    }
  }
  // else
  jbyte* to_bytes   = (jbyte*)to;
  jbyte* from_bytes = (jbyte*)from;
  size_t len_bytes  = count_in*BytesPerLong;
  MVCLE_MEMCOPY(to_bytes, from_bytes, len_bytes)
#else
  size_t count_in = count;
  if (has_destructive_overlap((char*)from, (char*)to, count_in*BytesPerLong)) {
    if (count_in < 8) {
      from += count_in;
      to   += count_in;
      while (count_in-- > 0) {
        *(--to) = *(--from); // Copy backwards, areas overlap destructively.
      }
      return;
    }
    // else: copy backwards, two longs per iteration.
    from += count_in - 1;
    to   += count_in - 1;
    if (count_in & 0x01) {
      *(to--) = *(from--);
      count_in--;
    }
    for (; count_in > 0; count_in -= 2) {
      *to     = *from;
      *(to-1) = *(from-1);
      to   -= 2;
      from -= 2;
    }
  } else {
    pd_aligned_disjoint_words((const HeapWord*)from, (HeapWord*)to, count_in); // Rare calls -> just delegate.
  }
#endif
}
static void pd_conjoint_oops_atomic(const oop* from, oop* to, size_t count) {
#ifdef USE_INLINE_ASM
  size_t count_in = count;
  if (has_destructive_overlap((char*)from, (char*)to, count_in*BytesPerOop)) {
    switch (count_in) {
      case 4: COPY8_ATOMIC_4(to,from)
              return;
      case 3: COPY8_ATOMIC_3(to,from)
              return;
      case 2: COPY8_ATOMIC_2(to,from)
              return;
      case 1: COPY8_ATOMIC_1(to,from)
              return;
      case 0: return;
      default:
        from += count_in;
        to   += count_in;
        while (count_in-- > 0) {
          *(--to) = *(--from); // Copy backwards, areas overlap destructively.
        }
        return;
    }
  }
  // else
  jbyte* to_bytes   = (jbyte*)to;
  jbyte* from_bytes = (jbyte*)from;
  size_t len_bytes  = count_in*BytesPerOop;
  MVCLE_MEMCOPY(to_bytes, from_bytes, len_bytes)
#else
  size_t count_in = count;
  if (has_destructive_overlap((char*)from, (char*)to, count_in*BytesPerOop)) {
    from += count_in;
    to   += count_in;
    while (count_in-- > 0) {
      *(--to) = *(--from); // Copy backwards, areas overlap destructively.
    }
    return;
  }
  // else
  pd_aligned_disjoint_words((const HeapWord*)from, (HeapWord*)to, count_in); // Rare calls -> just delegate.
  return;
#endif
}
static void pd_arrayof_conjoint_bytes(const HeapWord* from, HeapWord* to, size_t count) {
pd_conjoint_bytes_atomic(from, to, count);
}
//************************************************//
//   M E M O R Y   I N I T I A L I S A T I O N    //
//************************************************//
static void pd_fill_to_bytes(void* to, size_t count, jubyte value) {
  // JVM2008: very rare, only in some tests.
#ifdef USE_INLINE_ASM
  // Initialize storage to a given value. Use memset instead of copy loop.
  // For large chunks of memory, exploit special H/W support of z/Architecture:
  // 1) init short piece of memory to page-align address
  // 2) init largest part (all contained full pages) of memory using mvcle instruction.
  //    z/Architecture processors have special H/W support for page-aligned storage
  //    where len is an int multiple of page size. In that case, up to 4 cache lines are
  //    processed in parallel and L1 cache is not polluted.
  // 3) init the remaining piece of memory.
  // Atomicity cannot really be an issue since gcc implements the loop body with XC anyway.
  // If atomicity is a problem, we have to prevent gcc optimization. Best workaround: inline asm.
  jbyte* to_bytes  = (jbyte*)to;
  size_t len_bytes = count;
  MVCLE_MEMINIT(to_bytes, value, len_bytes) // Minimal body; the MVCLE-based init the comment above describes.
#else
  // Memset does the best job possible: loop over 256-byte MVCs, with
  // the last MVC EXecuted. With the -mmvcle option, initialization
  // is done using MVCLE -> slight advantage for large areas.
  (void)memset(to, value, count);
#endif
}
static void pd_fill_to_words(HeapWord* tohw, size_t count, juint value) {
  // Occurs in dbg builds only. Usually memory poisoning with BAADBABE, DEADBEEF, etc.
  // JVM2008: < 4k calls.
  if (value == 0) {
    pd_zero_to_words(tohw, count);
    return;
  }
  if (value == ~(juint)(0)) {
    pd_fill_to_bytes(tohw, count*HeapWordSize, (jubyte)(~(juint)(0)));
    return;
  }
julong* to = (julong*) tohw;
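  // Example (hypothetical): value == 0xDEADBEEF replicates to
  // v == 0xDEADBEEFDEADBEEF, so each 8-byte store below writes
  // the 4-byte fill pattern twice.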
  julong v = ((julong) value << 32) | value;
  while (count-- > 0) {
*to++ = v;
}
}
static void pd_fill_to_aligned_words(HeapWord* tohw, size_t count, juint value) {
  // JVM2008: very frequent, but virtually all calls are with value == 0.
pd_fill_to_words(tohw, count, value);
}
//************************************//
//   M E M O R Y   C L E A R I N G    //
//************************************//
// Delegate to pd_zero_to_bytes. It also works HeapWord-atomic.
// Distinguish between simple and large zero_to_words.
static void pd_zero_to_words(HeapWord* tohw, size_t count) {
pd_zero_to_bytes(tohw, count*HeapWordSize);
}
static void pd_zero_to_bytes(void* to, size_t count) {
  // JVM2008: some calls (generally), some tests frequent
#ifdef USE_INLINE_ASM
  // Even zero_to_bytes() requires HeapWord-atomic, or, at least, sequential
  // zeroing of the memory. MVCLE is not fit for that job:
  //   "As observed by other CPUs and by the channel subsystem,
  //    that portion of the first operand which is filled
  //    with the padding byte is not necessarily stored into in
  //    a left-to-right direction and may appear to be stored
  //    into more than once."
  // Therefore, implementation was changed to use (multiple) XC instructions.
  // Minimal body: dispatch between the two XC-based macros defined above,
  // as their length preconditions prescribe.
  jbyte* to_bytes  = (jbyte*)to;
  size_t len_bytes = count;
  if (len_bytes <= 256) {
    XC_MEMZERO_256(to_bytes, len_bytes)
  } else {
    XC_MEMZERO_ANY(to_bytes, len_bytes)
  }
#else
  // Memset does the best job possible: loop over 256-byte MVCs, with
  // the last MVC EXecuted. With the -mmvcle option, initialization
  // is done using MVCLE -> slight advantage for large areas.
  (void)memset(to, 0, count);
#endif
}
#endif // CPU_S390_COPY_S390_HPP