/*
 * We need the ability to prevent arbitration (MI_ARB_ON_OFF),
 * the ability to write PTEs using inline data (MI_STORE_DATA)
 * and of course the ability to do the block transfer (blits).
 */
GEM_BUG_ON(engine->class != COPY_ENGINE_CLASS);
/* * Insert a dummy PTE into every PT that will map to LMEM to ensure * we have a correctly setup PDE structure for later use.
*/
vm->insert_page(vm, 0, d->offset,
i915_gem_get_pat_index(vm->i915, I915_CACHE_NONE),
PTE_LM);
GEM_BUG_ON(!pt->is_compact);
d->offset += SZ_2M;
}
/* * We are playing tricks here, since the actual pt, from the hw * pov, is only 256bytes with 32 entries, or 4096bytes with 512 * entries, but we are still guaranteed that the physical * alignment is 64K underneath for the pt, and we are careful * not to access the space in the void.
*/
vm->insert_page(vm, px_dma(pt), d->offset,
i915_gem_get_pat_index(vm->i915, I915_CACHE_NONE),
PTE_LM);
d->offset += SZ_64K;
}
staticstruct i915_address_space *migrate_vm(struct intel_gt *gt)
{ struct i915_vm_pt_stash stash = {}; struct i915_ppgtt *vm; int err; int i;
/* * We construct a very special VM for use by all migration contexts, * it is kept pinned so that it can be used at any time. As we need * to pre-allocate the page directories for the migration VM, this * limits us to only using a small number of prepared vma. * * To be able to pipeline and reschedule migration operations while * avoiding unnecessary contention on the vm itself, the PTE updates * are inline with the blits. All the blits use the same fixed * addresses, with the backing store redirection being updated on the * fly. Only 2 implicit vma are used for all migration operations. * * We lay the ppGTT out as: * * [0, CHUNK_SZ) -> first object * [CHUNK_SZ, 2 * CHUNK_SZ) -> second object * [2 * CHUNK_SZ, 2 * CHUNK_SZ + 2 * CHUNK_SZ >> 9] -> PTE * * By exposing the dma addresses of the page directories themselves * within the ppGTT, we are then able to rewrite the PTE prior to use. * But the PTE update and subsequent migration operation must be atomic, * i.e. within the same non-preemptible window so that we do not switch * to another migration context that overwrites the PTE. * * This changes quite a bit on platforms with HAS_64K_PAGES support, * where we instead have three windows, each CHUNK_SIZE in size. The * first is reserved for mapping system-memory, and that just uses the * 512 entry layout using 4K GTT pages. The other two windows just map * lmem pages and must use the new compact 32 entry layout using 64K GTT * pages, which ensures we can address any lmem object that the user * throws at us. We then also use the xehp_toggle_pdes as a way of * just toggling the PDE bit(GEN12_PDE_64K) for us, to enable the * compact layout for each of these page-tables, that fall within the * [CHUNK_SIZE, 3 * CHUNK_SIZE) range. 
* * We lay the ppGTT out as: * * [0, CHUNK_SZ) -> first window/object, maps smem * [CHUNK_SZ, 2 * CHUNK_SZ) -> second window/object, maps lmem src * [2 * CHUNK_SZ, 3 * CHUNK_SZ) -> third window/object, maps lmem dst * * For the PTE window it's also quite different, since each PTE must * point to some 64K page, one for each PT(since it's in lmem), and yet * each is only <= 4096bytes, but since the unused space within that PTE * range is never touched, this should be fine. * * So basically each PT now needs 64K of virtual memory, instead of 4K, * which looks like: * * [3 * CHUNK_SZ, 3 * CHUNK_SZ + ((3 * CHUNK_SZ / SZ_2M) * SZ_64K)] -> PTE
*/
vm = i915_ppgtt_create(gt, I915_BO_ALLOC_PM_EARLY); if (IS_ERR(vm)) return ERR_CAST(vm);
if (HAS_64K_PAGES(gt->i915))
stash.pt_sz = I915_GTT_PAGE_SIZE_64K;
/* * Each engine instance is assigned its own chunk in the VM, so * that we can run multiple instances concurrently
*/ for (i = 0; i < ARRAY_SIZE(gt->engine_class[COPY_ENGINE_CLASS]); i++) { struct intel_engine_cs *engine;
u64 base = (u64)i << 32; struct insert_pte_data d = {}; struct i915_gem_ww_ctx ww;
u64 sz;
engine = gt->engine_class[COPY_ENGINE_CLASS][i]; if (!engine_supports_migration(engine)) continue;
/* * We copy in 8MiB chunks. Each PDE covers 2MiB, so we need * 4x2 page directories for source/destination.
*/ if (HAS_64K_PAGES(gt->i915))
sz = 3 * CHUNK_SZ; else
sz = 2 * CHUNK_SZ;
d.offset = base + sz;
/* * We need another page directory setup so that we can write * the 8x512 PTE in each chunk.
*/ if (HAS_64K_PAGES(gt->i915))
sz += (sz / SZ_2M) * SZ_64K; else
sz += (sz >> 12) * sizeof(u64);
err = i915_vm_alloc_pt_stash(&vm->vm, &stash, sz); if (err) goto err_vm;
for_i915_gem_ww(&ww, err, true) {
err = i915_vm_lock_objects(&vm->vm, &ww); if (err) continue;
err = i915_vm_map_pt_stash(&vm->vm, &stash); if (err) continue;
for (i = 0; i < ARRAY_SIZE(gt->engine_class[COPY_ENGINE_CLASS]); i++) {
engine = gt->engine_class[COPY_ENGINE_CLASS][i]; if (engine_supports_migration(engine)) return engine;
}
/* * We randomly distribute contexts across the engines upon construction, * as they all share the same pinned vm, and so in order to allow * multiple blits to run in parallel, we must construct each blit * to use a different range of the vm for its GTT. This has to be * known at construction, so we can not use the late greedy load * balancing of the virtual-engine.
*/
ce = __migrate_engines(m->context->engine->gt); if (IS_ERR(ce)) return ce;
/**
 * DOC: Flat-CCS - Memory compression for Local memory
 *
 * On Xe-HP and later devices, we use dedicated compression control state (CCS)
 * stored in local memory for each surface, to support the 3D and media
 * compression formats.
 *
 * The memory required for the CCS of the entire local memory is 1/256 of the
 * local memory size. So before the kernel boots, the required memory is reserved
 * for the CCS data and a secure register is programmed with the CCS base
 * address.
 *
 * Flat CCS data needs to be cleared when a lmem object is allocated.
 * And CCS data can be copied in and out of the CCS region through
 * XY_CTRL_SURF_COPY_BLT. The CPU can't access the CCS data directly.
 *
 * I915 supports Flat-CCS on lmem-only objects. When an object has smem in
 * its preference list, on memory pressure, i915 needs to migrate the lmem
 * content into smem. If the lmem object is Flat-CCS compressed by userspace,
 * then i915 needs to decompress it. But i915 lacks the required information
 * for such decompression. Hence i915 supports Flat-CCS only on lmem-only objects.
 *
 * When we exhaust the lmem, Flat-CCS capable objects' lmem backing memory can
 * be temporarily evicted to smem, along with the auxiliary CCS state, where
 * it can potentially be swapped out at a later point, if required.
 * If userspace later touches the evicted pages, then we always move
 * the backing memory back to lmem, which includes restoring the saved CCS state,
 * and potentially performing any required swap-in.
 *
 * For the migration of lmem objects with smem in the placement list, such as
 * {lmem, smem}, objects are treated as non Flat-CCS capable objects.
 */
/* * The XY_CTRL_SURF_COPY_BLT instruction is used to copy the CCS * data in and out of the CCS region. * * We can copy at most 1024 blocks of 256 bytes using one * XY_CTRL_SURF_COPY_BLT instruction. * * In case we need to copy more than 1024 blocks, we need to add * another instruction to the same batch buffer. * * 1024 blocks of 256 bytes of CCS represent a total 256KB of CCS. * * 256 KB of CCS represents 256 * 256 KB = 64 MB of LMEM.
*/
*cs++ = XY_CTRL_SURF_COPY_BLT |
src_access << SRC_ACCESS_TYPE_SHIFT |
dst_access << DST_ACCESS_TYPE_SHIFT |
((num_ccs_blks - 1) & CCS_SIZE_MASK) << CCS_SIZE_SHIFT;
*cs++ = src_offset;
*cs++ = rq->engine->instance |
FIELD_PREP(XY_CTRL_SURF_MOCS_MASK, mocs);
*cs++ = dst_offset;
*cs++ = rq->engine->instance |
FIELD_PREP(XY_CTRL_SURF_MOCS_MASK, mocs);
/*
 * Sum the DMA lengths of the mapped entries in a scatterlist.
 *
 * Walks the list from @sg, accumulating sg_dma_len() for each entry,
 * and stops at the end of the list or at the first entry with a zero
 * dma length (end of the dma-mapped portion).
 */
static u64 scatter_list_length(struct scatterlist *sg)
{
	u64 total = 0;

	for (; sg && sg_dma_len(sg); sg = sg_next(sg))
		total += sg_dma_len(sg);

	return total;
}
/*
 * calculate_chunk_sz - size of the next migration chunk, in bytes.
 * @i915: device private (reserved for platform checks)
 * @src_is_lmem: true if the copy source resides in local memory
 * @bytes_to_cpy: remaining main-memory bytes still to be copied
 * @ccs_bytes_to_cpy: remaining CCS (compression state) bytes to be copied
 *
 * Normally each blit operates on a full CHUNK_SZ window.  Fixes the
 * original "staticint" token (missing whitespace) which would not compile.
 */
static int
calculate_chunk_sz(struct drm_i915_private *i915, bool src_is_lmem,
		   u64 bytes_to_cpy, u64 ccs_bytes_to_cpy)
{
	if (ccs_bytes_to_cpy && !src_is_lmem)
		/*
		 * When CHUNK_SZ is passed, all the pages up to CHUNK_SZ
		 * will be taken for the blit. On Flat-CCS supported
		 * platforms the smem object will have more pages than
		 * required for main memory, hence limit it to the
		 * required size for main memory.
		 */
		return min_t(u64, bytes_to_cpy, CHUNK_SZ);
	else
		return CHUNK_SZ;
}
/*
 * When there is an eviction of CCS-needed smem, there will be
 * extra pages for the CCS data.
 *
 * TO-DO: Want to move the size mismatch check to a WARN_ON,
 * but we still have some requests of smem->lmem with the same size.
 * Need to fix it.
 */
ccs_bytes_to_cpy = src_sz != dst_sz ? GET_CCS_BYTES(i915, bytes_to_cpy) : 0; if (ccs_bytes_to_cpy)
get_ccs_sg_sgt(&it_ccs, bytes_to_cpy);
}
if (src_is_lmem) { /* * If the src is already in lmem, then we must * be doing an lmem -> lmem transfer, and so * should be safe to directly copy the CCS * state. In this case we have either * initialised the CCS aux state when first * clearing the pages (since it is already * allocated in lmem), or the user has * potentially populated it, in which case we * need to copy the CCS state as-is.
*/
err = emit_copy_ccs(rq,
dst_offset, INDIRECT_ACCESS,
src_offset, INDIRECT_ACCESS,
len);
} else { /* * While we can't always restore/manage the CCS * state, we still need to ensure we don't leak * the CCS state from the previous user, so make * sure we overwrite it with something.
*/
err = emit_copy_ccs(rq,
dst_offset, INDIRECT_ACCESS,
dst_offset, DIRECT_ACCESS,
len);
}
if (err) goto out_rq;
err = rq->engine->emit_flush(rq, EMIT_INVALIDATE); if (err) goto out_rq;
}
/* Arbitration is re-enabled between requests. */
out_rq: if (*out)
i915_request_put(*out);
*out = i915_request_get(rq);
i915_request_add(rq);
if (err) break;
if (!bytes_to_cpy && !ccs_bytes_to_cpy) { if (src_is_lmem)
WARN_ON(it_src.sg && sg_dma_len(it_src.sg)); else
WARN_ON(it_dst.sg && sg_dma_len(it_dst.sg)); break;
}
/*
 * NOTE(review): the following text appears to be extraneous web-page
 * boilerplate accidentally pasted into the source and should be removed.
 * English translation of the original German: "The information on this
 * web page was compiled carefully to the best of our knowledge. However,
 * neither completeness, nor correctness, nor quality of the provided
 * information is guaranteed. Remark: the coloured syntax display and the
 * measurement are still experimental."
 */