/* * Although MI_STORE_DATA_IMM's "length" field is 10-bits, 0x3FE is the largest * legal value accepted. Since that instruction field is always stored in * (val-2) format, this translates to 0x400 dwords for the true maximum length * of the instruction. Subtracting the instruction header (1 dword) and * address (2 dwords), that leaves 0x3FD dwords (0x1FE qwords) for PTE values.
*/ #define MAX_PTE_PER_SDI 0x1FEU
/** * xe_tile_migrate_exec_queue() - Get this tile's migrate exec queue. * @tile: The tile. * * Returns the default migrate exec queue of this tile. * * Return: The default migrate exec queue
*/ struct xe_exec_queue *xe_tile_migrate_exec_queue(struct xe_tile *tile)
{ return tile->migrate->q;
}
/* First slot is reserved for mapping of PT bo and bb, start from 1 */ return (slot + 1ULL) << xe_pt_shift(level + 1);
}
static u64 xe_migrate_vram_ofs(struct xe_device *xe, u64 addr, bool is_comp_pte)
{ /* * Remove the DPA to get a correct offset into identity table for the * migrate offset
*/
u64 identity_offset = IDENTITY_OFFSET;
/* * Use 1GB pages when possible, last chunk always use 2M * pages as mixing reserved memory (stolen, WOCPM) with a single * mapping is not allowed on certain platforms.
*/ for (pos = dpa_base; pos < vram_limit;
pos += SZ_1G, ofs += 8) { if (pos + SZ_1G >= vram_limit) {
entry = vm->pt_ops->pde_encode_bo(bo, pt_2m_ofs);
xe_map_wr(xe, &bo->vmap, ofs, u64, entry);
/* Can't bump NUM_PT_SLOTS too high */
BUILD_BUG_ON(NUM_PT_SLOTS > SZ_2M/XE_PAGE_SIZE); /* Must be a multiple of 64K to support all platforms */
BUILD_BUG_ON(NUM_PT_SLOTS * XE_PAGE_SIZE % SZ_64K); /* And one slot reserved for the 4KiB page table updates */
BUILD_BUG_ON(!(NUM_KERNEL_PDE & 1));
/* Need to be sure everything fits in the first PT, or create more */
xe_tile_assert(tile, m->batch_base_ofs + xe_bo_size(batch) < SZ_2M);
bo = xe_bo_create_pin_map(vm->xe, tile, vm,
num_entries * XE_PAGE_SIZE,
ttm_bo_type_kernel,
XE_BO_FLAG_VRAM_IF_DGFX(tile) |
XE_BO_FLAG_PAGETABLE); if (IS_ERR(bo)) return PTR_ERR(bo);
/* Map the entire BO in our level 0 pt */ for (i = 0, level = 0; i < num_entries; level++) {
entry = vm->pt_ops->pte_encode_bo(bo, i * XE_PAGE_SIZE,
pat_index, 0);
if (vm->flags & XE_VM_FLAG_64K)
i += 16; else
i += 1;
}
if (!IS_DGFX(xe)) { /* Write out batch too */
m->batch_base_ofs = NUM_PT_SLOTS * XE_PAGE_SIZE; for (i = 0; i < xe_bo_size(batch);
i += vm->flags & XE_VM_FLAG_64K ? XE_64K_PAGE_SIZE :
XE_PAGE_SIZE) {
entry = vm->pt_ops->pte_encode_bo(batch, i,
pat_index, 0);
for (i = 0; i < xe_bo_size(batch);
i += vm->flags & XE_VM_FLAG_64K ? XE_64K_PAGE_SIZE :
XE_PAGE_SIZE) {
entry = vm->pt_ops->pte_encode_bo(batch, i,
pat_index, 0);
/* * Example layout created above, with root level = 3: * [PT0...PT7]: kernel PT's for copy/clear; 64 or 4KiB PTE's * [PT8]: Kernel PT for VM_BIND, 4 KiB PTE's * [PT9...PT26]: Userspace PT's for VM_BIND, 4 KiB PTE's * [PT27 = PDE 0] [PT28 = PDE 1] [PT29 = PDE 2] [PT30 & PT31 = 2M vram identity map] * * This makes the lowest part of the VM point to the pagetables. * Hence the lowest 2M in the vm should point to itself, with a few writes * and flushes, other parts of the VM can be used either for copying and * clearing. * * For performance, the kernel reserves PDE's, so about 20 are left * for async VM updates. * * To make it easier to work, each scratch PT is put in slot (1 + PT #) * everywhere, this allows lockless updates to scratch pages by using * the different addresses in VM.
*/ #define NUM_VMUSA_UNIT_PER_PAGE 32 #define VM_SA_UPDATE_UNIT_SIZE (XE_PAGE_SIZE / NUM_VMUSA_UNIT_PER_PAGE) #define NUM_VMUSA_WRITES_PER_UNIT (VM_SA_UPDATE_UNIT_SIZE / sizeof(u64))
drm_suballoc_manager_init(&m->vm_update_sa,
(size_t)(map_ofs / XE_PAGE_SIZE - NUM_KERNEL_PDE) *
NUM_VMUSA_UNIT_PER_PAGE, 0);
m->pt_bo = bo; return 0;
}
/* * Including the reserved copy engine is required to avoid deadlocks due to * migrate jobs servicing the faults gets stuck behind the job that faulted.
*/ static u32 xe_migrate_usm_logical_mask(struct xe_gt *gt)
{
u32 logical_mask = 0; struct xe_hw_engine *hwe; enum xe_hw_engine_id id;
for_each_hw_engine(hwe, gt, id) { if (hwe->class != XE_ENGINE_CLASS_COPY) continue;
if (xe_gt_is_usm_hwe(gt, hwe))
logical_mask |= BIT(hwe->logical_instance);
}
if (mem_type_is_vram(cur->mem_type)) { /* * VRAM we want to blit in chunks with sizes aligned to * min_chunk_size in order for the offset to CCS metadata to be * page-aligned. If it's the last chunk it may be smaller. * * Another constraint is that we need to limit the blit to * the VRAM block size, unless size is smaller than * min_chunk_size.
*/
u64 chunk = max_t(u64, cur->size, m->min_chunk_size);
if (!copy_ccs && dst_is_indirect) { /* * If the src is already in vram, then it should already * have been cleared by us, or has been populated by the * user. Make sure we copy the CCS aux state as-is. * * Otherwise if the bo doesn't have any CCS metadata attached, * we still need to clear it for security reasons.
*/
u64 ccs_src_ofs = src_is_indirect ? src_ofs : m->cleared_mem_ofs;
/** * xe_migrate_copy() - Copy content of TTM resources. * @m: The migration context. * @src_bo: The buffer object @src is currently bound to. * @dst_bo: If copying between resources created for the same bo, set this to * the same value as @src_bo. If copying between buffer objects, set it to * the buffer object @dst is currently bound to. * @src: The source TTM resource. * @dst: The dst TTM resource. * @copy_only_ccs: If true copy only CCS metadata * * Copies the contents of @src to @dst: On flat CCS devices, * the CCS metadata is copied as well if needed, or if not present, * the CCS metadata of @dst is cleared for security reasons. * * Return: Pointer to a dma_fence representing the last copy batch, or * an error pointer on failure. If there is a failure, any copy operation * started by the function call has been synced.
*/ struct dma_fence *xe_migrate_copy(struct xe_migrate *m, struct xe_bo *src_bo, struct xe_bo *dst_bo, struct ttm_resource *src, struct ttm_resource *dst, bool copy_only_ccs)
{ struct xe_gt *gt = m->tile->primary_gt; struct xe_device *xe = gt_to_xe(gt); struct dma_fence *fence = NULL;
u64 size = xe_bo_size(src_bo); struct xe_res_cursor src_it, dst_it, ccs_it;
u64 src_L0_ofs, dst_L0_ofs;
u32 src_L0_pt, dst_L0_pt;
u64 src_L0, dst_L0; int pass = 0; int err; bool src_is_pltt = src->mem_type == XE_PL_TT; bool dst_is_pltt = dst->mem_type == XE_PL_TT; bool src_is_vram = mem_type_is_vram(src->mem_type); bool dst_is_vram = mem_type_is_vram(dst->mem_type); bool type_device = src_bo->ttm.type == ttm_bo_type_device; bool needs_ccs_emit = type_device && xe_migrate_needs_ccs_emit(xe); bool copy_ccs = xe_device_has_flat_ccs(xe) &&
xe_bo_needs_ccs_pages(src_bo) && xe_bo_needs_ccs_pages(dst_bo); bool copy_system_ccs = copy_ccs && (!src_is_vram || !dst_is_vram); bool use_comp_pat = type_device && xe_device_has_flat_ccs(xe) &&
GRAPHICS_VER(xe) >= 20 && src_is_vram && !dst_is_vram;
/* Copying CCS between two different BOs is not supported yet. */ if (XE_WARN_ON(copy_ccs && src_bo != dst_bo)) return ERR_PTR(-EINVAL);
if (src_bo != dst_bo && XE_WARN_ON(xe_bo_size(src_bo) != xe_bo_size(dst_bo))) return ERR_PTR(-EINVAL);
staticbool has_service_copy_support(struct xe_gt *gt)
{ /* * What we care about is whether the architecture was designed with * service copy functionality (specifically the new MEM_SET / MEM_COPY * instructions) so check the architectural engine list rather than the * actual list since these instructions are usable on BCS0 even if * all of the actual service copy engines (BCS1-BCS8) have been fused * off.
*/ return gt->info.engine_mask & GENMASK(XE_HW_ENGINE_BCS8,
XE_HW_ENGINE_BCS1);
}
/** * xe_migrate_clear() - Copy content of TTM resources. * @m: The migration context. * @bo: The buffer object @dst is currently bound to. * @dst: The dst TTM resource to be cleared. * @clear_flags: flags to specify which data to clear: CCS, BO, or both. * * Clear the contents of @dst to zero when XE_MIGRATE_CLEAR_FLAG_BO_DATA is set. * On flat CCS devices, the CCS metadata is cleared to zero with XE_MIGRATE_CLEAR_FLAG_CCS_DATA. * Set XE_MIGRATE_CLEAR_FLAG_FULL to clear bo as well as CCS metadata. * TODO: Eliminate the @bo argument. * * Return: Pointer to a dma_fence representing the last clear batch, or * an error pointer on failure. If there is a failure, any clear operation * started by the function call has been synced.
*/ struct dma_fence *xe_migrate_clear(struct xe_migrate *m, struct xe_bo *bo, struct ttm_resource *dst,
u32 clear_flags)
{ bool clear_vram = mem_type_is_vram(dst->mem_type); bool clear_bo_data = XE_MIGRATE_CLEAR_FLAG_BO_DATA & clear_flags; bool clear_ccs = XE_MIGRATE_CLEAR_FLAG_CCS_DATA & clear_flags; struct xe_gt *gt = m->tile->primary_gt; struct xe_device *xe = gt_to_xe(gt); bool clear_only_system_ccs = false; struct dma_fence *fence = NULL;
u64 size = xe_bo_size(bo); struct xe_res_cursor src_it; struct ttm_resource *src = dst; int err;
if (WARN_ON(!clear_bo_data && !clear_ccs)) return NULL;
if (!clear_bo_data && clear_ccs && !IS_DGFX(xe))
clear_only_system_ccs = true;
size -= clear_L0; /* Preemption is enabled again by the ring ops. */ if (clear_vram && xe_migrate_allow_identity(clear_L0, &src_it))
xe_res_next(&src_it, clear_L0); else
emit_pte(m, bb, clear_L0_pt, clear_vram, clear_only_system_ccs,
&src_it, clear_L0, dst);
xe_sched_job_add_migrate_flush(job, flush_flags); if (!fence) { /* * There can't be anything userspace related at this * point, so we just need to respect any potential move * fences, which are always tracked as * DMA_RESV_USAGE_KERNEL.
*/
err = xe_sched_job_add_deps(job, bo->ttm.base.resv,
DMA_RESV_USAGE_KERNEL); if (err) goto err_job;
}
/* * If we have 512 entries (max), we would populate it ourselves, * and update the PDE above it to the new pointer. * The only time this can only happen if we have to update the top * PDE. This requires a BO that is almost vm->size big. * * This shouldn't be possible in practice.. might change when 16K * pages are used. Hence the assert.
*/
xe_tile_assert(tile, update->qwords < MAX_NUM_PTE); if (!ppgtt_ofs)
ppgtt_ofs = xe_migrate_vram_ofs(tile_to_xe(tile),
xe_bo_addr(update->pt_bo, 0,
XE_PAGE_SIZE), false);
do {
u64 addr = ppgtt_ofs + ofs * 8;
chunk = min(size, MAX_PTE_PER_SDI);
/* Ensure populatefn can do memset64 by aligning bb->cs */ if (!(bb->len & 1))
bb->cs[bb->len++] = MI_NOOP;
/** * xe_migrate_update_pgtables() - Pipelined page-table update * @m: The migrate context. * @pt_update: PT update arguments * * Perform a pipelined page-table update. The update descriptors are typically * built under the same lock critical section as a call to this function. If * using the default engine for the updates, they will be performed in the * order they grab the job_mutex. If different engines are used, external * synchronization is needed for overlapping updates to maintain page-table * consistency. Note that the meaning of "overlapping" is that the updates * touch the same page-table, which might be a higher-level page-directory. * If no pipelining is needed, then updates may be performed by the cpu. * * Return: A dma_fence that, when signaled, indicates the update completion.
*/ struct dma_fence *
xe_migrate_update_pgtables(struct xe_migrate *m, struct xe_migrate_pt_update *pt_update)
/**
 * xe_migrate_wait() - Complete all operations using the xe_migrate context
 * @m: Migrate context to wait for.
 *
 * Blocks (uninterruptibly) until the GPU no longer uses the migrate
 * context's default engine or its page-table objects. FIXME: What about
 * separate page-table update engines?
 */
void xe_migrate_wait(struct xe_migrate *m)
{
	struct dma_fence *fence = m->fence;

	if (fence)
		dma_fence_wait(fence, false);
}
/* * MI_STORE_DATA_IMM command is used to update page table. Each * instruction can update maximumly MAX_PTE_PER_SDI pte entries. To * update n (n <= MAX_PTE_PER_SDI) pte entries, we need: * * - 1 dword for the MI_STORE_DATA_IMM command header (opcode etc) * - 2 dword for the page table's physical location * - 2*n dword for value of pte to fill (each pte entry is 2 dwords)
*/
num_dword = (1 + 2) * DIV_U64_ROUND_UP(entries, MAX_PTE_PER_SDI);
num_dword += entries * 2;
/** * xe_migrate_access_memory - Access memory of a BO via GPU * * @m: The migration context. * @bo: buffer object * @offset: access offset into buffer object * @buf: pointer to caller memory to read into or write from * @len: length of access * @write: write access * * Access memory of a BO via GPU either reading in or writing from a passed in * pointer. Pointer is dma mapped for GPU access and GPU commands are issued to * read to or write from pointer. * * Returns: * 0 if successful, negative error code on failure.
*/ int xe_migrate_access_memory(struct xe_migrate *m, struct xe_bo *bo, unsignedlong offset, void *buf, int len, int write)
{ struct xe_tile *tile = m->tile; struct xe_device *xe = tile_to_xe(tile); struct xe_res_cursor cursor; struct dma_fence *fence = NULL;
dma_addr_t *dma_addr; unsignedlong page_offset = (unsignedlong)buf & ~PAGE_MASK; int bytes_left = len, current_page = 0; void *orig_buf = buf;
xe_bo_assert_held(bo);
/* Use bounce buffer for small access and unaligned access */ if (!IS_ALIGNED(len, XE_CACHELINE_BYTES) ||
!IS_ALIGNED((unsignedlong)buf + offset, XE_CACHELINE_BYTES)) { int buf_offset = 0; void *bounce; int err;
BUILD_BUG_ON(!is_power_of_2(XE_CACHELINE_BYTES));
bounce = kmalloc(XE_CACHELINE_BYTES, GFP_KERNEL); if (!bounce) return -ENOMEM;
/* * Less than ideal for large unaligned access but this should be * fairly rare, can fixup if this becomes common.
*/ do { int copy_bytes = min_t(int, bytes_left,
XE_CACHELINE_BYTES -
(offset & XE_CACHELINE_MASK)); int ptr_offset = offset & XE_CACHELINE_MASK;
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit noch Richtigkeit
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.