// SPDX-License-Identifier: MIT /* * Copyright 2014-2018 Advanced Micro Devices, Inc. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), * to deal in the Software without restriction, including without limitation * the rights to use, copy, modify, merge, publish, distribute, sublicense, * and/or sell copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR * OTHER DEALINGS IN THE SOFTWARE.
*/ #include <linux/dma-buf.h> #include <linux/list.h> #include <linux/pagemap.h> #include <linux/sched/mm.h> #include <linux/sched/task.h> #include <drm/ttm/ttm_tt.h>
/* Userptr restore delay, just long enough to allow consecutive VM
 * changes to accumulate
 */
#define AMDGPU_USERPTR_RESTORE_DELAY_MS 1

/* Memory reservation constant: 3 << 29 bytes (1.5 GiB) */
#define AMDGPU_RESERVE_MEM_LIMIT (3UL << 29)

/*
 * Align VRAM availability to 2MB to avoid fragmentation caused by 4K
 * allocations in the tail 2MB BO chunk
 */
#define VRAM_AVAILABLITY_ALIGN (1 << 21)
/* Impose limit on how much memory KFD can use */ staticstruct {
uint64_t max_system_mem_limit;
uint64_t max_ttm_mem_limit;
int64_t system_mem_used;
int64_t ttm_mem_used;
spinlock_t mem_limit_lock;
} kfd_mem_limit;
list_for_each_entry(entry, &mem->attachments, list) if (entry->bo_va->base.vm == avm) returntrue;
returnfalse;
}
/**
 * reuse_dmamap() - Check whether adev can share the original
 * userptr BO
 *
 * If both adev and bo_adev are in direct mapping or
 * in the same iommu group, they can share the original BO.
 *
 * @adev: Device which can or cannot share the original BO
 * @bo_adev: Device to which the allocated BO belongs
 *
 * Return: true if adev can share the original userptr BO,
 * false otherwise.
 */
static bool reuse_dmamap(struct amdgpu_device *adev,
			 struct amdgpu_device *bo_adev)
{
	return (adev->ram_is_direct_mapped && bo_adev->ram_is_direct_mapped) ||
	       (adev->dev->iommu_group == bo_adev->dev->iommu_group);
}
/* Set memory usage limits. Currently, the limits are: * System (TTM + userptr) memory - 15/16th System RAM * TTM memory - 3/8th System RAM
*/ void amdgpu_amdkfd_gpuvm_init_mem_limits(void)
{ struct sysinfo si;
uint64_t mem;
if (kfd_mem_limit.max_system_mem_limit) return;
si_meminfo(&si);
mem = si.totalram - si.totalhigh;
mem *= si.mem_unit;
/* Estimate page table size needed to represent a given memory size * * With 4KB pages, we need one 8 byte PTE for each 4KB of memory * (factor 512, >> 9). With 2MB pages, we need one 8 byte PTE for 2MB * of memory (factor 256K, >> 18). ROCm user mode tries to optimize * for 2MB pages for TLB efficiency. However, small allocations and * fragmented system memory still need some 4KB pages. We choose a * compromise that should work in most cases without reserving too * much memory for page tables unnecessarily (factor 16K, >> 14).
*/
/** * amdgpu_amdkfd_reserve_mem_limit() - Decrease available memory by size * of buffer. * * @adev: Device to which the allocated BO belongs * @size: Size of buffer, in bytes, encapsulated by BO. This should be * equivalent to amdgpu_bo_size(BO) * @alloc_flag: Flag used in allocating a BO as noted above * @xcp_id: xcp_id is used to get xcp from xcp manager, one xcp is * managed as one compute node in driver for app * * Return: * returns -ENOMEM if a memory limit would be exceeded, -EINVAL for an * invalid XCP ID, ZERO otherwise
*/ int amdgpu_amdkfd_reserve_mem_limit(struct amdgpu_device *adev,
uint64_t size, u32 alloc_flag, int8_t xcp_id)
{
uint64_t reserved_for_pt =
ESTIMATE_PT_SIZE(amdgpu_amdkfd_total_mem_size); struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
uint64_t reserved_for_ras = (con ? con->reserved_pages_in_bytes : 0);
size_t system_mem_needed, ttm_mem_needed, vram_needed; int ret = 0;
uint64_t vram_size = 0;
system_mem_needed = 0;
ttm_mem_needed = 0;
vram_needed = 0; if (alloc_flag & KFD_IOC_ALLOC_MEM_FLAGS_GTT) {
system_mem_needed = size;
ttm_mem_needed = size;
} elseif (alloc_flag & KFD_IOC_ALLOC_MEM_FLAGS_VRAM) { /* * Conservatively round up the allocation requirement to 2 MB * to avoid fragmentation caused by 4K allocations in the tail * 2M BO chunk.
*/
vram_needed = size; /* * For GFX 9.4.3, get the VRAM size from XCP structs
*/ if (WARN_ONCE(xcp_id < 0, "invalid XCP ID %d", xcp_id)) return -EINVAL;
if (kfd_mem_limit.system_mem_used + system_mem_needed >
kfd_mem_limit.max_system_mem_limit) {
pr_debug("Set no_system_mem_limit=1 if using shared memory\n"); if (!no_system_mem_limit) {
ret = -ENOMEM; goto release;
}
}
if (kfd_mem_limit.ttm_mem_used + ttm_mem_needed >
kfd_mem_limit.max_ttm_mem_limit) {
ret = -ENOMEM; goto release;
}
/*if is_app_apu is false and apu_prefer_gtt is true, it is an APU with * carve out < gtt. In that case, VRAM allocation will go to gtt domain, skip * VRAM check since ttm_mem_limit check already cover this allocation
*/
/** * create_dmamap_sg_bo() - Creates an amdgpu_bo object to reflect information * about USERPTR or DOORBELL or MMIO BO. * * @adev: Device for which dmamap BO is being created * @mem: BO of peer device that is being DMA mapped. Provides parameters * in building the dmamap BO * @bo_out: Output parameter updated with handle of dmamap BO
*/ staticint
create_dmamap_sg_bo(struct amdgpu_device *adev, struct kgd_mem *mem, struct amdgpu_bo **bo_out)
{ struct drm_gem_object *gem_obj; int ret;
uint64_t flags = 0;
ret = amdgpu_bo_reserve(mem->bo, false); if (ret) return ret;
/* amdgpu_amdkfd_remove_eviction_fence - Removes eviction fence from BO's
 *  reservation object.
 *
 * @bo: [IN] Remove eviction fence(s) from this BO
 * @ef: [IN] This eviction fence is removed if it
 *           is present in the shared list.
 *
 * The fences are not dropped outright: every fence in the BO's reservation
 * object that belongs to @ef's fence context is replaced with the stub
 * fence obtained from dma_fence_get_stub().
 *
 * NOTE: Must be called with BO reserved i.e. bo->tbo.resv->lock held.
 *
 * Return: -EINVAL if @ef is NULL, 0 otherwise.
 */
static int amdgpu_amdkfd_remove_eviction_fence(struct amdgpu_bo *bo,
					struct amdgpu_amdkfd_fence *ef)
{
	struct dma_fence *replacement;

	if (!ef)
		return -EINVAL;

	/* TODO: Instead of block before we should use the fence of the page
	 * table update and TLB flush here directly.
	 */
	replacement = dma_fence_get_stub();
	dma_resv_replace_fences(bo->tbo.base.resv, ef->base.context,
				replacement, DMA_RESV_USAGE_BOOKKEEP);
	/* Drop the local reference taken by dma_fence_get_stub() */
	dma_fence_put(replacement);

	return 0;
}
/** * amdgpu_amdkfd_remove_all_eviction_fences - Remove all eviction fences * @bo: the BO where to remove the eviction fences from. * * This function should only be used on release when all references to the BO * are already dropped. We remove the eviction fence from the private copy of * the dma_resv object here since that is what is used during release to * determine if the BO is idle or not.
*/ void amdgpu_amdkfd_remove_all_eviction_fences(struct amdgpu_bo *bo)
{ struct dma_resv *resv = &bo->tbo.base._resv; struct dma_fence *fence, *stub; struct dma_resv_iter cursor;
/** * create_sg_table() - Create an sg_table for a contiguous DMA addr range * @addr: The starting address to point to * @size: Size of memory area in bytes being pointed to * * Allocates an instance of sg_table and initializes it to point to memory * area specified by input parameters. The address used to build is assumed * to be DMA mapped, if needed. * * DOORBELL or MMIO BOs use only one scatterlist node in their sg_table * because they are physically contiguous. * * Return: Initialized instance of SG Table or NULL
*/ staticstruct sg_table *create_sg_table(uint64_t addr, uint32_t size)
{ struct sg_table *sg = kmalloc(sizeof(*sg), GFP_KERNEL);
if (WARN_ON(ttm->num_pages != src_ttm->num_pages)) return -EINVAL;
ttm->sg = kmalloc(sizeof(*ttm->sg), GFP_KERNEL); if (unlikely(!ttm->sg)) return -ENOMEM;
/* Same sequence as in amdgpu_ttm_tt_pin_userptr */
ret = sg_alloc_table_from_pages(ttm->sg, src_ttm->pages,
ttm->num_pages, 0,
(u64)ttm->num_pages << PAGE_SHIFT,
GFP_KERNEL); if (unlikely(ret)) goto free_sg;
ret = dma_map_sgtable(adev->dev, ttm->sg, direction, 0); if (unlikely(ret)) goto release_sg;
amdgpu_bo_placement_from_domain(bo, AMDGPU_GEM_DOMAIN_GTT);
ret = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx); if (ret) goto unmap_sg;
/** * kfd_mem_dmamap_sg_bo() - Create DMA mapped sg_table to access DOORBELL or MMIO BO * @mem: SG BO of the DOORBELL or MMIO resource on the owning device * @attachment: Virtual address attachment of the BO on accessing device * * An access request from the device that owns DOORBELL does not require DMA mapping. * This is because the request doesn't go through PCIe root complex i.e. it instead * loops back. The need to DMA map arises only when accessing peer device's DOORBELL * * In contrast, all access requests for MMIO need to be DMA mapped without regard to * device ownership. This is because access requests for MMIO go through PCIe root * complex. * * This is accomplished in two steps: * - Obtain DMA mapped address of DOORBELL or MMIO memory that could be used * in updating requesting device's page table * - Signal TTM to mark memory pointed to by requesting device's BO as GPU * accessible. This allows an update of requesting device's page table * with entries associated with DOORBELL or MMIO memory * * This method is invoked in the following contexts: * - Mapping of DOORBELL or MMIO BO of same or peer device * - Validating an evicted DOORBELL or MMIO BO on device seeking access * * Return: ZERO if successful, NON-ZERO otherwise
*/ staticint
kfd_mem_dmamap_sg_bo(struct kgd_mem *mem, struct kfd_mem_attachment *attachment)
{ struct ttm_operation_ctx ctx = {.interruptible = true}; struct amdgpu_bo *bo = attachment->bo_va->base.bo; struct amdgpu_device *adev = attachment->adev; struct ttm_tt *ttm = bo->tbo.ttm; enum dma_data_direction dir;
dma_addr_t dma_addr; bool mmio; int ret;
/* Expect SG Table of dmapmap BO to be NULL */
mmio = (mem->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP); if (unlikely(ttm->sg)) {
pr_err("SG Table of %d BO for peer device is UNEXPECTEDLY NON-NULL", mmio); return -EINVAL;
}
dir = mem->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE ?
DMA_BIDIRECTIONAL : DMA_TO_DEVICE;
dma_addr = mem->bo->tbo.sg->sgl->dma_address;
pr_debug("%d BO size: %d\n", mmio, mem->bo->tbo.sg->sgl->length);
pr_debug("%d BO address before DMA mapping: %llx\n", mmio, dma_addr);
dma_addr = dma_map_resource(adev->dev, dma_addr,
mem->bo->tbo.sg->sgl->length, dir, DMA_ATTR_SKIP_CPU_SYNC);
ret = dma_mapping_error(adev->dev, dma_addr); if (unlikely(ret)) return ret;
pr_debug("%d BO address after DMA mapping: %llx\n", mmio, dma_addr);
ttm->sg = create_sg_table(dma_addr, mem->bo->tbo.sg->sgl->length); if (unlikely(!ttm->sg)) {
ret = -ENOMEM; goto unmap_sg;
}
amdgpu_bo_placement_from_domain(bo, AMDGPU_GEM_DOMAIN_GTT);
ret = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx); if (unlikely(ret)) goto free_sg;
/* kfd_mem_dmaunmap_dmabuf - DMA-unmap hook for DMABuf attachments
 *
 * @attachment: attachment being unmapped (unused)
 *
 * Intentionally empty, see the comment in the body.
 */
static void
kfd_mem_dmaunmap_dmabuf(struct kfd_mem_attachment *attachment)
{
	/* This is a no-op. We don't want to trigger eviction fences when
	 * unmapping DMABufs. Therefore the invalidation (moving to system
	 * domain) is done in kfd_mem_dmamap_dmabuf.
	 */
}
/** * kfd_mem_dmaunmap_sg_bo() - Free DMA mapped sg_table of DOORBELL or MMIO BO * @mem: SG BO of the DOORBELL or MMIO resource on the owning device * @attachment: Virtual address attachment of the BO on accessing device * * The method performs following steps: * - Signal TTM to mark memory pointed to by BO as GPU inaccessible * - Free SG Table that is used to encapsulate DMA mapped memory of * peer device's DOORBELL or MMIO memory * * This method is invoked in the following contexts: * UNMapping of DOORBELL or MMIO BO on a device having access to its memory * Eviction of DOORBELL or MMIO BO on device having access to its memory * * Return: void
*/ staticvoid
kfd_mem_dmaunmap_sg_bo(struct kgd_mem *mem, struct kfd_mem_attachment *attachment)
{ struct ttm_operation_ctx ctx = {.interruptible = true}; struct amdgpu_bo *bo = attachment->bo_va->base.bo; struct amdgpu_device *adev = attachment->adev; struct ttm_tt *ttm = bo->tbo.ttm; enum dma_data_direction dir;
if (unlikely(!ttm->sg)) {
pr_debug("SG Table of BO is NULL"); return;
}
/* kfd_mem_attach - Add a BO to a VM * * Everything that needs to bo done only once when a BO is first added * to a VM. It can later be mapped and unmapped many times without * repeating these steps. * * 0. Create BO for DMA mapping, if needed * 1. Allocate and initialize BO VA entry data structure * 2. Add BO to the VM * 3. Determine ASIC-specific PTE flags * 4. Alloc page tables and directories if needed * 4a. Validate new page tables and directories
*/ staticint kfd_mem_attach(struct amdgpu_device *adev, struct kgd_mem *mem, struct amdgpu_vm *vm, bool is_aql)
{ struct amdgpu_device *bo_adev = amdgpu_ttm_adev(mem->bo->tbo.bdev); unsignedlong bo_size = mem->bo->tbo.base.size;
uint64_t va = mem->va; struct kfd_mem_attachment *attachment[2] = {NULL, NULL}; struct amdgpu_bo *bo[2] = {NULL, NULL}; struct amdgpu_bo_va *bo_va; bool same_hive = false; int i, ret;
if (!va) {
pr_err("Invalid VA when adding BO to VM\n"); return -EINVAL;
}
/* Determine access to VRAM, MMIO and DOORBELL BOs of peer devices * * The access path of MMIO and DOORBELL BOs of is always over PCIe. * In contrast the access path of VRAM BOs depens upon the type of * link that connects the peer device. Access over PCIe is allowed * if peer device has large BAR. In contrast, access over xGMI is * allowed for both small and large BAR configurations of peer device
*/ if ((adev != bo_adev && !adev->apu_prefer_gtt) &&
((mem->domain == AMDGPU_GEM_DOMAIN_VRAM) ||
(mem->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL) ||
(mem->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP))) { if (mem->domain == AMDGPU_GEM_DOMAIN_VRAM)
same_hive = amdgpu_xgmi_same_hive(adev, bo_adev); if (!same_hive && !amdgpu_device_is_peer_accessible(bo_adev, adev)) return -EINVAL;
}
for (i = 0; i <= is_aql; i++) {
attachment[i] = kzalloc(sizeof(*attachment[i]), GFP_KERNEL); if (unlikely(!attachment[i])) {
ret = -ENOMEM; goto unwind;
}
pr_debug("\t add VA 0x%llx - 0x%llx to vm %p\n", va,
va + bo_size, vm);
if ((adev == bo_adev && !(mem->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP)) ||
(amdgpu_ttm_tt_get_usermm(mem->bo->tbo.ttm) && reuse_dmamap(adev, bo_adev)) ||
(mem->domain == AMDGPU_GEM_DOMAIN_GTT && reuse_dmamap(adev, bo_adev)) ||
same_hive) { /* Mappings on the local GPU, or VRAM mappings in the * local hive, or userptr, or GTT mapping can reuse dma map * address space share the original BO
*/
attachment[i]->type = KFD_MEM_ATT_SHARED;
bo[i] = mem->bo;
drm_gem_object_get(&bo[i]->tbo.base);
} elseif (i > 0) { /* Multiple mappings on the same GPU share the BO */
attachment[i]->type = KFD_MEM_ATT_SHARED;
bo[i] = bo[0];
drm_gem_object_get(&bo[i]->tbo.base);
} elseif (amdgpu_ttm_tt_get_usermm(mem->bo->tbo.ttm)) { /* Create an SG BO to DMA-map userptrs on other GPUs */
attachment[i]->type = KFD_MEM_ATT_USERPTR;
ret = create_dmamap_sg_bo(adev, mem, &bo[i]); if (ret) goto unwind; /* Handle DOORBELL BOs of peer devices and MMIO BOs of local and peer devices */
} elseif (mem->bo->tbo.type == ttm_bo_type_sg) {
WARN_ONCE(!(mem->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL ||
mem->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP), "Handing invalid SG BO in ATTACH request");
attachment[i]->type = KFD_MEM_ATT_SG;
ret = create_dmamap_sg_bo(adev, mem, &bo[i]); if (ret) goto unwind; /* Enable acces to GTT and VRAM BOs of peer devices */
} elseif (mem->domain == AMDGPU_GEM_DOMAIN_GTT ||
mem->domain == AMDGPU_GEM_DOMAIN_VRAM) {
attachment[i]->type = KFD_MEM_ATT_DMABUF;
ret = kfd_mem_attach_dmabuf(adev, mem, &bo[i]); if (ret) goto unwind;
pr_debug("Employ DMABUF mechanism to enable peer GPU access\n");
} else {
WARN_ONCE(true, "Handling invalid ATTACH request");
ret = -EINVAL; goto unwind;
}
/* Add BO to VM internal data structures */
ret = amdgpu_bo_reserve(bo[i], false); if (ret) {
pr_debug("Unable to reserve BO during memory attach"); goto unwind;
}
bo_va = amdgpu_vm_bo_find(vm, bo[i]); if (!bo_va)
bo_va = amdgpu_vm_bo_add(adev, vm, bo[i]); else
++bo_va->ref_count;
attachment[i]->bo_va = bo_va;
amdgpu_bo_unreserve(bo[i]); if (unlikely(!attachment[i]->bo_va)) {
ret = -ENOMEM;
pr_err("Failed to add BO object to VM. ret == %d\n",
ret); goto unwind;
}
attachment[i]->va = va;
attachment[i]->pte_flags = get_pte_flags(adev, mem);
attachment[i]->adev = adev;
list_add(&attachment[i]->list, &mem->attachments);
va += bo_size;
}
return 0;
unwind: for (; i >= 0; i--) { if (!attachment[i]) continue; if (attachment[i]->bo_va) {
(void)amdgpu_bo_reserve(bo[i], true); if (--attachment[i]->bo_va->ref_count == 0)
amdgpu_vm_bo_del(adev, attachment[i]->bo_va);
amdgpu_bo_unreserve(bo[i]);
list_del(&attachment[i]->list);
} if (bo[i])
drm_gem_object_put(&bo[i]->tbo.base);
kfree(attachment[i]);
} return ret;
}
/* Initializes user pages. It registers the MMU notifier and validates * the userptr BO in the GTT domain. * * The BO must already be on the userptr_valid_list. Otherwise an * eviction and restore may happen that leaves the new BO unmapped * with the user mode queues running. * * Takes the process_info->lock to protect against concurrent restore * workers. * * Returns 0 for success, negative errno for errors.
*/ staticint init_user_pages(struct kgd_mem *mem, uint64_t user_addr, bool criu_resume)
{ struct amdkfd_process_info *process_info = mem->process_info; struct amdgpu_bo *bo = mem->bo; struct ttm_operation_ctx ctx = { true, false }; struct hmm_range *range; int ret = 0;
mutex_lock(&process_info->lock);
ret = amdgpu_ttm_tt_set_userptr(&bo->tbo, user_addr, 0); if (ret) {
pr_err("%s: Failed to set userptr: %d\n", __func__, ret); goto out;
}
ret = amdgpu_hmm_register(bo, user_addr); if (ret) {
pr_err("%s: Failed to register MMU notifier: %d\n",
__func__, ret); goto out;
}
if (criu_resume) { /* * During a CRIU restore operation, the userptr buffer objects * will be validated in the restore_userptr_work worker at a * later stage when it is scheduled by another ioctl called by * CRIU master process for the target pid for restore.
*/
mutex_lock(&process_info->notifier_lock);
mem->invalid++;
mutex_unlock(&process_info->notifier_lock);
mutex_unlock(&process_info->lock); return 0;
}
ret = amdgpu_ttm_tt_get_user_pages(bo, bo->tbo.ttm->pages, &range); if (ret) { if (ret == -EAGAIN)
pr_debug("Failed to get user pages, try again\n"); else
pr_err("%s: Failed to get user pages: %d\n", __func__, ret); goto unregister_out;
}
ret = amdgpu_bo_reserve(bo, true); if (ret) {
pr_err("%s: Failed to reserve BO\n", __func__); goto release_out;
}
amdgpu_bo_placement_from_domain(bo, mem->domain);
ret = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx); if (ret)
pr_err("%s: failed to validate BO\n", __func__);
amdgpu_bo_unreserve(bo);
/* Reserving a BO and its page table BOs must happen atomically to
 * avoid deadlocks. Some operations update multiple VMs at once. Track
 * all the reservation info in a context structure. Optionally a sync
 * object can track VM updates.
 */
struct bo_vm_reservation_context {
	/* DRM execution context for the reservation */
	struct drm_exec exec;
	/* Number of VMs reserved */
	unsigned int n_vms;
	/* Pointer to sync object */
	struct amdgpu_sync *sync;
};
/* Mapping-status filter used to pick which of a BO's VMs to reserve */
enum bo_vm_match {
	/* Match VMs where a BO is not mapped */
	BO_VM_NOT_MAPPED = 0,
	/* Match VMs where a BO is mapped */
	BO_VM_MAPPED = 1,
	/* Match all VMs a BO was added to */
	BO_VM_ALL = 2,
};
/** * reserve_bo_and_vm - reserve a BO and a VM unconditionally. * @mem: KFD BO structure. * @vm: the VM to reserve. * @ctx: the struct that will be used in unreserve_bo_and_vms().
*/ staticint reserve_bo_and_vm(struct kgd_mem *mem, struct amdgpu_vm *vm, struct bo_vm_reservation_context *ctx)
{ struct amdgpu_bo *bo = mem->bo; int ret;
ret = drm_exec_prepare_obj(&ctx->exec, &bo->tbo.base, 1);
drm_exec_retry_on_contention(&ctx->exec); if (unlikely(ret)) goto error;
} return 0;
error:
pr_err("Failed to reserve buffers in ttm.\n");
drm_exec_fini(&ctx->exec); return ret;
}
/** * reserve_bo_and_cond_vms - reserve a BO and some VMs conditionally * @mem: KFD BO structure. * @vm: the VM to reserve. If NULL, then all VMs associated with the BO * is used. Otherwise, a single VM associated with the BO. * @map_type: the mapping status that will be used to filter the VMs. * @ctx: the struct that will be used in unreserve_bo_and_vms(). * * Returns 0 for success, negative for failure.
*/ staticint reserve_bo_and_cond_vms(struct kgd_mem *mem, struct amdgpu_vm *vm, enum bo_vm_match map_type, struct bo_vm_reservation_context *ctx)
{ struct kfd_mem_attachment *entry; struct amdgpu_bo *bo = mem->bo; int ret;
ret = amdgpu_vm_lock_pd(entry->bo_va->base.vm,
&ctx->exec, 2);
drm_exec_retry_on_contention(&ctx->exec); if (unlikely(ret)) goto error;
++ctx->n_vms;
}
ret = drm_exec_prepare_obj(&ctx->exec, &bo->tbo.base, 1);
drm_exec_retry_on_contention(&ctx->exec); if (unlikely(ret)) goto error;
} return 0;
error:
pr_err("Failed to reserve buffers in ttm.\n");
drm_exec_fini(&ctx->exec); return ret;
}
/** * unreserve_bo_and_vms - Unreserve BO and VMs from a reservation context * @ctx: Reservation context to unreserve * @wait: Optionally wait for a sync object representing pending VM updates * @intr: Whether the wait is interruptible * * Also frees any resources allocated in * reserve_bo_and_(cond_)vm(s). Returns the status from * amdgpu_sync_wait.
*/ staticint unreserve_bo_and_vms(struct bo_vm_reservation_context *ctx, bool wait, bool intr)
{ int ret = 0;
if (wait)
ret = amdgpu_sync_wait(ctx->sync, intr);
/* Set virtual address for the allocation */
ret = amdgpu_vm_bo_map(entry->adev, entry->bo_va, entry->va, 0,
amdgpu_bo_size(entry->bo_va->base.bo),
entry->pte_flags); if (ret) {
pr_err("Failed to map VA 0x%llx in vm. ret %d\n",
entry->va, ret); return ret;
}
if (no_update_pte) return 0;
ret = update_gpuvm_pte(mem, entry, sync); if (ret) {
pr_err("update_gpuvm_pte() failed\n"); goto update_gpuvm_pte_failed;
}
/* Validate page directory and attach eviction fence */
ret = amdgpu_bo_reserve(vm->root.bo, true); if (ret) goto reserve_pd_fail;
ret = vm_validate_pt_pd_bos(vm, NULL); if (ret) {
pr_err("validate_pt_pd_bos() failed\n"); goto validate_pd_fail;
}
ret = amdgpu_bo_sync_wait(vm->root.bo,
AMDGPU_FENCE_OWNER_KFD, false); if (ret) goto wait_pd_fail;
ret = dma_resv_reserve_fences(vm->root.bo->tbo.base.resv, 1); if (ret) goto reserve_shared_fail;
dma_resv_add_fence(vm->root.bo->tbo.base.resv,
&vm->process_info->eviction_fence->base,
DMA_RESV_USAGE_BOOKKEEP);
amdgpu_bo_unreserve(vm->root.bo);
/* Update process info */
mutex_lock(&vm->process_info->lock);
list_add_tail(&vm->vm_list_node,
&(vm->process_info->vm_list_head));
vm->process_info->n_vms++; if (ef)
*ef = dma_fence_get(&vm->process_info->eviction_fence->base);
mutex_unlock(&vm->process_info->lock);
/** * amdgpu_amdkfd_gpuvm_pin_bo() - Pins a BO using following criteria * @bo: Handle of buffer object being pinned * @domain: Domain into which BO should be pinned * * - USERPTR BOs are UNPINNABLE and will return error * - All other BO types (GTT, VRAM, MMIO and DOORBELL) will have their * PIN count incremented. It is valid to PIN a BO multiple times * * Return: ZERO if successful in pinning, Non-Zero in case of error.
*/ staticint amdgpu_amdkfd_gpuvm_pin_bo(struct amdgpu_bo *bo, u32 domain)
{ int ret = 0;
ret = amdgpu_bo_reserve(bo, false); if (unlikely(ret)) return ret;
if (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS) { /* * If bo is not contiguous on VRAM, move to system memory first to ensure * we can get contiguous VRAM space after evicting other BOs.
*/ if (!(bo->tbo.resource->placement & TTM_PL_FLAG_CONTIGUOUS)) { struct ttm_operation_ctx ctx = { true, false };
amdgpu_bo_placement_from_domain(bo, AMDGPU_GEM_DOMAIN_GTT);
ret = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx); if (unlikely(ret)) {
pr_debug("validate bo 0x%p to GTT failed %d\n", &bo->tbo, ret); goto out;
}
}
}
ret = amdgpu_bo_pin(bo, domain); if (ret)
pr_err("Error in Pinning BO to domain: %d\n", domain);
/**
 * amdgpu_amdkfd_gpuvm_unpin_bo() - Unpins BO using following criteria
 * @bo: Handle of buffer object being unpinned
 *
 *   - Is an illegal request for USERPTR BOs and is ignored
 *   - All other BO types (GTT, VRAM, MMIO and DOORBELL) will have their
 *     PIN count decremented. Calls to UNPIN must balance calls to PIN
 *
 * The BO is reserved around the unpin. If the reservation fails the
 * function returns without unpinning and without reporting the error.
 */
static void amdgpu_amdkfd_gpuvm_unpin_bo(struct amdgpu_bo *bo)
{
	int ret;

	ret = amdgpu_bo_reserve(bo, false);
	if (unlikely(ret))
		return;

	amdgpu_bo_unpin(bo);
	amdgpu_bo_unreserve(bo);
}
int amdgpu_amdkfd_gpuvm_acquire_process_vm(struct amdgpu_device *adev, struct amdgpu_vm *avm, void **process_info, struct dma_fence **ef)
{ int ret;
/* Already a compute VM? */ if (avm->process_info) return -EINVAL;
/* Convert VM into a compute VM */
ret = amdgpu_vm_make_compute(adev, avm); if (ret) return ret;
/* Initialize KFD part of the VM and process info */
ret = init_kfd_vm(avm, process_info, ef); if (ret) return ret;
/* Update process info */
mutex_lock(&process_info->lock);
process_info->n_vms--;
list_del(&vm->vm_list_node);
mutex_unlock(&process_info->lock);
vm->process_info = NULL;
/* Release per-process resources when last compute VM is destroyed */ if (!process_info->n_vms) {
WARN_ON(!list_empty(&process_info->kfd_bo_list));
WARN_ON(!list_empty(&process_info->userptr_valid_list));
WARN_ON(!list_empty(&process_info->userptr_inval_list));
/* Workaround for AQL queue wraparound bug. Map the same * memory twice. That means we only actually allocate half * the memory.
*/ if ((*mem)->aql_queue)
size >>= 1;
aligned_size = PAGE_ALIGN(size);
(*mem)->alloc_flags = flags;
amdgpu_sync_create(&(*mem)->sync);
ret = amdgpu_amdkfd_reserve_mem_limit(adev, aligned_size, flags,
xcp_id); if (ret) {
pr_debug("Insufficient memory\n"); goto err_reserve_limit;
}
pr_debug("\tcreate BO VA 0x%llx size 0x%llx domain %s xcp_id %d\n",
va, (*mem)->aql_queue ? size << 1 : size,
domain_string(alloc_domain), xcp_id);
ret = amdgpu_gem_object_create(adev, aligned_size, 1, alloc_domain, alloc_flags,
bo_type, NULL, &gobj, xcp_id + 1); if (ret) {
pr_debug("Failed to create BO on domain %s. ret %d\n",
domain_string(alloc_domain), ret); goto err_bo_create;
}
ret = drm_vma_node_allow(&gobj->vma_node, drm_priv); if (ret) {
pr_debug("Failed to allow vma node access. ret %d\n", ret); goto err_node_allow;
}
ret = drm_gem_handle_create(adev->kfd.client.file, gobj, &(*mem)->gem_handle); if (ret) goto err_gem_handle_create;
bo = gem_to_amdgpu_bo(gobj); if (bo_type == ttm_bo_type_sg) {
bo->tbo.sg = sg;
bo->tbo.ttm->sg = sg;
}
bo->kfd_bo = *mem;
(*mem)->bo = bo; if (user_addr)
bo->flags |= AMDGPU_AMDKFD_CREATE_USERPTR_BO;
/* Unpin MMIO/DOORBELL BO's that were pinned during allocation */ if (mem->alloc_flags &
(KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL |
KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP)) {
amdgpu_amdkfd_gpuvm_unpin_bo(mem->bo);
}
mapped_to_gpu_memory = mem->mapped_to_gpu_memory;
is_imported = mem->is_imported;
mutex_unlock(&mem->lock); /* lock is not needed after this, since mem is unused and will * be freed anyway
*/
if (mapped_to_gpu_memory > 0) {
pr_debug("BO VA 0x%llx size 0x%lx is still mapped.\n",
mem->va, bo_size); return -EBUSY;
}
/* Make sure restore workers don't access the BO any more */
mutex_lock(&process_info->lock);
list_del(&mem->validate_list);
mutex_unlock(&process_info->lock);
/* Cleanup user pages and MMU notifiers */ if (amdgpu_ttm_tt_get_usermm(mem->bo->tbo.ttm)) {
amdgpu_hmm_unregister(mem->bo);
mutex_lock(&process_info->notifier_lock);
amdgpu_ttm_tt_discard_user_pages(mem->bo->tbo.ttm, mem->range);
mutex_unlock(&process_info->notifier_lock);
}
ret = reserve_bo_and_cond_vms(mem, NULL, BO_VM_ALL, &ctx); if (unlikely(ret)) return ret;
/* Remove from VM internal data structures */
list_for_each_entry_safe(entry, tmp, &mem->attachments, list) {
kfd_mem_dmaunmap_attachment(mem, entry);
kfd_mem_detach(entry);
}
ret = unreserve_bo_and_vms(&ctx, false, false);
/* Free the sync object */
amdgpu_sync_free(&mem->sync);
/* If the SG is not NULL, it's one we created for a doorbell or mmio * remap BO. We need to free it.
*/ if (mem->bo->tbo.sg) {
sg_free_table(mem->bo->tbo.sg);
kfree(mem->bo->tbo.sg);
}
/* Update the size of the BO being freed if it was allocated from * VRAM and is not imported. For APP APU VRAM allocations are done * in GTT domain
*/ if (size) { if (!is_imported &&
(mem->bo->preferred_domains == AMDGPU_GEM_DOMAIN_VRAM ||
(adev->apu_prefer_gtt &&
mem->bo->preferred_domains == AMDGPU_GEM_DOMAIN_GTT)))
*size = bo_size; else
*size = 0;
}
/* Free the BO*/
drm_vma_node_revoke(&mem->bo->tbo.base.vma_node, drm_priv);
drm_gem_handle_delete(adev->kfd.client.file, mem->gem_handle); if (mem->dmabuf) {
dma_buf_put(mem->dmabuf);
mem->dmabuf = NULL;
}
mutex_destroy(&mem->lock);
/* If this releases the last reference, it will end up calling * amdgpu_amdkfd_release_notify and kfree the mem struct. That's why * this needs to be the last call here.
*/
drm_gem_object_put(&mem->bo->tbo.base);
/* * For kgd_mem allocated in amdgpu_amdkfd_gpuvm_import_dmabuf(), * explicitly free it here.
*/ if (!use_release_notifier)
kfree(mem);
bo = mem->bo; if (!bo) {
pr_err("Invalid BO when mapping memory to GPU\n"); return -EINVAL;
}
/* Make sure restore is not running concurrently. Since we * don't map invalid userptr BOs, we rely on the next restore * worker to do the mapping
*/
mutex_lock(&mem->process_info->lock);
/* Lock notifier lock. If we find an invalid userptr BO, we can be * sure that the MMU notifier is no longer running * concurrently and the queues are actually stopped
*/ if (amdgpu_ttm_tt_get_usermm(bo->tbo.ttm)) {
mutex_lock(&mem->process_info->notifier_lock);
is_invalid_userptr = !!mem->invalid;
mutex_unlock(&mem->process_info->notifier_lock);
}
pr_debug("Map VA 0x%llx - 0x%llx to vm %p domain %s\n",
mem->va,
mem->va + bo_size * (1 + mem->aql_queue),
avm, domain_string(domain));
if (!kfd_mem_is_attached(avm, mem)) {
ret = kfd_mem_attach(adev, mem, avm, mem->aql_queue); if (ret) goto out;
}
ret = reserve_bo_and_vm(mem, avm, &ctx); if (unlikely(ret)) goto out;
/* Userptr can be marked as "not invalid", but not actually be * validated yet (still in the system domain). In that case * the queues are still stopped and we can leave mapping for * the next restore worker
*/ if (amdgpu_ttm_tt_get_usermm(bo->tbo.ttm) &&
bo->tbo.resource->mem_type == TTM_PL_SYSTEM)
is_invalid_userptr = true;
ret = vm_validate_pt_pd_bos(avm, NULL); if (unlikely(ret)) goto out_unreserve;
ret = reserve_bo_and_cond_vms(mem, avm, BO_VM_MAPPED, &ctx); if (unlikely(ret)) goto out; /* If no VMs were reserved, it means the BO wasn't actually mapped */ if (ctx.n_vms == 0) {
ret = -EINVAL; goto unreserve_out;
}
ret = vm_validate_pt_pd_bos(avm, NULL); if (unlikely(ret)) goto unreserve_out;
pr_debug("Unmap VA 0x%llx - 0x%llx from vm %p\n",
mem->va,
mem->va + bo_size * (1 + mem->aql_queue),
avm);
ret = amdgpu_sync_wait(&sync, intr);
amdgpu_sync_free(&sync); return ret;
}
/** * amdgpu_amdkfd_map_gtt_bo_to_gart - Map BO to GART and increment reference count * @bo: Buffer object to be mapped * @bo_gart: Return bo reference * * Before return, bo reference count is incremented. To release the reference and unpin/ * unmap the BO, call amdgpu_amdkfd_free_gtt_mem.
*/ int amdgpu_amdkfd_map_gtt_bo_to_gart(struct amdgpu_bo *bo, struct amdgpu_bo **bo_gart)
{ int ret;
ret = amdgpu_bo_reserve(bo, true); if (ret) {
pr_err("Failed to reserve bo. ret %d\n", ret); goto err_reserve_bo_failed;
}
ret = amdgpu_bo_pin(bo, AMDGPU_GEM_DOMAIN_GTT); if (ret) {
pr_err("Failed to pin bo. ret %d\n", ret); goto err_pin_bo_failed;
}
ret = amdgpu_ttm_alloc_gart(&bo->tbo); if (ret) {
pr_err("Failed to bind bo to GART. ret %d\n", ret); goto err_map_bo_gart_failed;
}
/** amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel() - Map a GTT BO for kernel CPU access * * @mem: Buffer object to be mapped for CPU access * @kptr[out]: pointer in kernel CPU address space * @size[out]: size of the buffer * * Pins the BO and maps it for kernel CPU access. The eviction fence is removed * from the BO, since pinned BOs cannot be evicted. The bo must remain on the * validate_list, so the GPU mapping can be restored after a page table was * evicted. * * Return: 0 on success, error code on failure
*/ int amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel(struct kgd_mem *mem, void **kptr, uint64_t *size)
{ int ret; struct amdgpu_bo *bo = mem->bo;
if (amdgpu_ttm_tt_get_usermm(bo->tbo.ttm)) {
pr_err("userptr can't be mapped to kernel\n"); return -EINVAL;
}
mutex_lock(&mem->process_info->lock);
ret = amdgpu_bo_reserve(bo, true); if (ret) {
pr_err("Failed to reserve bo. ret %d\n", ret); goto bo_reserve_failed;
}
ret = amdgpu_bo_pin(bo, AMDGPU_GEM_DOMAIN_GTT); if (ret) {
pr_err("Failed to pin bo. ret %d\n", ret); goto pin_failed;
}
ret = amdgpu_bo_kmap(bo, kptr); if (ret) {
pr_err("Failed to map bo to kernel. ret %d\n", ret); goto kmap_failed;
}
/** amdgpu_amdkfd_gpuvm_unmap_gtt_bo_from_kernel() - Unmap a GTT BO for kernel CPU access * * @mem: Buffer object to be unmapped for CPU access * * Removes the kernel CPU mapping and unpins the BO. It does not restore the * eviction fence, so this function should only be used for cleanup before the * BO is destroyed.
*/ void amdgpu_amdkfd_gpuvm_unmap_gtt_bo_from_kernel(struct kgd_mem *mem)
{ struct amdgpu_bo *bo = mem->bo;
bo = gem_to_amdgpu_bo(obj); if (!(bo->preferred_domains & (AMDGPU_GEM_DOMAIN_VRAM |
AMDGPU_GEM_DOMAIN_GTT))) /* Only VRAM and GTT BOs are supported */ return -EINVAL;
*mem = kzalloc(sizeof(struct kgd_mem), GFP_KERNEL); if (!*mem) return -ENOMEM;
ret = drm_vma_node_allow(&obj->vma_node, drm_priv); if (ret) goto err_free_mem;
if (size)
*size = amdgpu_bo_size(bo);
if (mmap_offset)
*mmap_offset = amdgpu_bo_mmap_offset(bo);
mutex_lock(&avm->process_info->lock); if (avm->process_info->eviction_fence &&
!dma_fence_is_signaled(&avm->process_info->eviction_fence->base))
ret = amdgpu_amdkfd_bo_validate_and_fence(bo, (*mem)->domain,
&avm->process_info->eviction_fence->base);
mutex_unlock(&avm->process_info->lock); if (ret) goto err_remove_mem;
/* Evict a userptr BO by stopping the queues if necessary * * Runs in MMU notifier, may be in RECLAIM_FS context. This means it * cannot do any memory allocations, and cannot take any locks that * are held elsewhere while allocating memory. * * It doesn't do anything to the BO itself. The real work happens in * restore, where we get updated page addresses. This function only * ensures that GPU access to the BO is stopped.
*/ int amdgpu_amdkfd_evict_userptr(struct mmu_interval_notifier *mni, unsignedlong cur_seq, struct kgd_mem *mem)
{ struct amdkfd_process_info *process_info = mem->process_info; int r = 0;
/* Do not process MMU notifications during CRIU restore until * KFD_CRIU_OP_RESUME IOCTL is received
*/ if (READ_ONCE(process_info->block_mmu_notifications)) return 0;
mem->invalid++; if (++process_info->evicted_bos == 1) { /* First eviction, stop the queues */
r = kgd2kfd_quiesce_mm(mni->mm,
KFD_QUEUE_EVICTION_TRIGGER_USERPTR);
if (r && r != -ESRCH)
pr_err("Failed to quiesce KFD\n");
if (r != -ESRCH)
queue_delayed_work(system_freezable_wq,
&process_info->restore_userptr_work,
msecs_to_jiffies(AMDGPU_USERPTR_RESTORE_DELAY_MS));
}
mutex_unlock(&process_info->notifier_lock);
return r;
}
/* Update invalid userptr BOs * * Moves invalidated (evicted) userptr BOs from userptr_valid_list to * userptr_inval_list and updates user pages for all BOs that have * been invalidated since their last update.
*/ staticint update_invalid_user_pages(struct amdkfd_process_info *process_info, struct mm_struct *mm)
{ struct kgd_mem *mem, *tmp_mem; struct amdgpu_bo *bo; struct ttm_operation_ctx ctx = { false, false };
uint32_t invalid; int ret = 0;
mutex_lock(&process_info->notifier_lock);
/* Move all invalidated BOs to the userptr_inval_list */
list_for_each_entry_safe(mem, tmp_mem,
&process_info->userptr_valid_list,
validate_list) if (mem->invalid)
list_move_tail(&mem->validate_list,
&process_info->userptr_inval_list);
/* Go through userptr_inval_list and update any invalid user_pages */
list_for_each_entry(mem, &process_info->userptr_inval_list,
validate_list) {
invalid = mem->invalid; if (!invalid) /* BO hasn't been invalidated since the last * revalidation attempt. Keep its page list.
*/ continue;
/* BO reservations and getting user pages (hmm_range_fault) * must happen outside the notifier lock
*/
mutex_unlock(&process_info->notifier_lock);
/* Move the BO to system (CPU) domain if necessary to unmap * and free the SG table
*/ if (bo->tbo.resource->mem_type != TTM_PL_SYSTEM) { if (amdgpu_bo_reserve(bo, true)) return -EAGAIN;
amdgpu_bo_placement_from_domain(bo, AMDGPU_GEM_DOMAIN_CPU);
ret = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx);
amdgpu_bo_unreserve(bo); if (ret) {
pr_err("%s: Failed to invalidate userptr BO\n",
__func__); return -EAGAIN;
}
}
/* Get updated user pages */
ret = amdgpu_ttm_tt_get_user_pages(bo, bo->tbo.ttm->pages,
&mem->range); if (ret) {
pr_debug("Failed %d to get user pages\n", ret);
/* Return -EFAULT bad address error as success. It will * fail later with a VM fault if the GPU tries to access * it. Better than hanging indefinitely with stalled * user mode queues. * * Return other error -EBUSY or -ENOMEM to retry restore
*/ if (ret != -EFAULT) return ret;
/* If applications unmap memory before destroying the userptr * from the KFD, trigger a segmentation fault in VM debug mode.
*/ if (amdgpu_ttm_adev(bo->tbo.bdev)->debug_vm_userptr) { struct kfd_process *p;
pr_err("Pid %d unmapped memory before destroying userptr at GPU addr 0x%llx\n",
pid_nr(process_info->pid), mem->va);
// Send GPU VM fault to user space
p = kfd_lookup_process_by_pid(process_info->pid); if (p) {
kfd_signal_vm_fault_event_with_userptr(p, mem->va);
kfd_unref_process(p);
}
}
ret = 0;
}
mutex_lock(&process_info->notifier_lock);
/* Mark the BO as valid unless it was invalidated * again concurrently.
*/ if (mem->invalid != invalid) {
ret = -EAGAIN; goto unlock_out;
} /* set mem valid if mem has hmm range associated */ if (mem->range)
mem->invalid = 0;
}
/* Validate the BO if we got user pages */ if (bo->tbo.ttm->pages[0]) {
amdgpu_bo_placement_from_domain(bo, mem->domain);
ret = ttm_bo_validate(&bo->tbo, &bo->placement, &ctx); if (ret) {
pr_err("%s: failed to validate BO\n", __func__); goto unreserve_out;
}
}
/* Update mapping. If the BO was not validated * (because we couldn't get user pages), this will * clear the page table entries, which will result in * VM faults if the GPU tries to access the invalid * memory.
*/
list_for_each_entry(attachment, &mem->attachments, list) { if (!attachment->is_mapped) continue;
kfd_mem_dmaunmap_attachment(mem, attachment);
ret = update_gpuvm_pte(mem, attachment, &sync); if (ret) {
pr_err("%s: update PTE failed\n", __func__); /* make sure this gets validated again */
mutex_lock(&process_info->notifier_lock);
mem->invalid++;
mutex_unlock(&process_info->notifier_lock); goto unreserve_out;
}
}
}
/* Update page directories */
ret = process_update_pds(process_info, &sync);
/* Confirm that all user pages are valid while holding the notifier lock * * Moves valid BOs from the userptr_inval_list back to userptr_valid_list.
*/ staticint confirm_valid_user_pages_locked(struct amdkfd_process_info *process_info)
{ struct kgd_mem *mem, *tmp_mem; int ret = 0;
/* Worker callback to restore evicted userptr BOs * * Tries to update and validate all userptr BOs. If successful and no * concurrent evictions happened, the queues are restarted. Otherwise, * reschedule for another attempt later.
*/ staticvoid amdgpu_amdkfd_restore_userptr_worker(struct work_struct *work)
{ struct delayed_work *dwork = to_delayed_work(work); struct amdkfd_process_info *process_info =
container_of(dwork, struct amdkfd_process_info,
restore_userptr_work); struct task_struct *usertask; struct mm_struct *mm;
uint32_t evicted_bos;
mutex_lock(&process_info->notifier_lock);
evicted_bos = process_info->evicted_bos;
mutex_unlock(&process_info->notifier_lock); if (!evicted_bos) return;
/* Reference task and mm in case of concurrent process termination */
usertask = get_pid_task(process_info->pid, PIDTYPE_PID); if (!usertask) return;
mm = get_task_mm(usertask); if (!mm) {
put_task_struct(usertask); return;
}
mutex_lock(&process_info->lock);
if (update_invalid_user_pages(process_info, mm)) goto unlock_out; /* userptr_inval_list can be empty if all evicted userptr BOs * have been freed. In that case there is nothing to validate * and we can just restart the queues.
*/ if (!list_empty(&process_info->userptr_inval_list)) { if (validate_invalid_user_pages(process_info)) goto unlock_out;
} /* Final check for concurrent eviction and atomic update. If * another eviction happens after successful update, it will * be a first eviction that calls quiesce_mm. The eviction * reference counting inside KFD will handle this case.
*/
mutex_lock(&process_info->notifier_lock); if (process_info->evicted_bos != evicted_bos) goto unlock_notifier_out;
if (kgd2kfd_resume_mm(mm)) {
pr_err("%s: Failed to resume KFD\n", __func__); /* No recovery from this failure. Probably the CP is * hanging. No point trying again.
*/
}
/* If validation failed, reschedule another attempt */ if (evicted_bos) {
queue_delayed_work(system_freezable_wq,
&process_info->restore_userptr_work,
msecs_to_jiffies(AMDGPU_USERPTR_RESTORE_DELAY_MS));
/* If we're replacing an unsignaled eviction fence, that fence will * never be signaled, and if anyone is still waiting on that fence, * they will hang forever. This should never happen. We should only * replace the fence in restore_work that only gets scheduled after * eviction work signaled the fence.
*/
WARN_ONCE(!dma_fence_is_signaled(old_ef), "Replacing unsignaled eviction fence");
dma_fence_put(old_ef);
}
/** amdgpu_amdkfd_gpuvm_restore_process_bos - Restore all BOs for the given * KFD process identified by process_info * * @process_info: amdkfd_process_info of the KFD process * * After memory eviction, restore thread calls this function. The function * should be called when the Process is still valid. BO restore involves - * * 1. Release old eviction fence and create new one * 2. Get two copies of PD BO list from all the VMs. Keep one copy as pd_list. * 3. Use the second PD list and kfd_bo_list to create a list (ctx.list) of * BOs that need to be reserved. * 4. Reserve all the BOs * 5. Validate PD and PT BOs. * 6. Validate all KFD BOs using kfd_bo_list and Map them and add new fence * 7. Add fence to all PD and PT BOs. * 8. Unreserve all BOs
*/ int amdgpu_amdkfd_gpuvm_restore_process_bos(void *info, struct dma_fence __rcu **ef)
{ struct amdkfd_process_info *process_info = info; struct amdgpu_vm *peer_vm; struct kgd_mem *mem; struct list_head duplicate_save; struct amdgpu_sync sync_obj; unsignedlong failed_size = 0; unsignedlong total_size = 0; struct drm_exec exec; int ret;
INIT_LIST_HEAD(&duplicate_save);
mutex_lock(&process_info->lock);
drm_exec_init(&exec, DRM_EXEC_IGNORE_DUPLICATES, 0);
drm_exec_until_all_locked(&exec) {
list_for_each_entry(peer_vm, &process_info->vm_list_head,
vm_list_node) {
ret = amdgpu_vm_lock_pd(peer_vm, &exec, 2);
drm_exec_retry_on_contention(&exec); if (unlikely(ret)) {
pr_err("Locking VM PD failed, ret: %d\n", ret); goto ttm_reserve_fail;
}
}
/* Reserve all BOs and page tables/directory. Add all BOs from * kfd_bo_list to ctx.list
*/
list_for_each_entry(mem, &process_info->kfd_bo_list,
validate_list) { struct drm_gem_object *gobj;
/* Sync with fences on all the page tables. They implicitly depend on any * move fences from amdgpu_vm_handle_moved above.
*/
ret = process_sync_pds_resv(process_info, &sync_obj); if (ret) {
pr_debug("Memory eviction: Failed to sync to PD BO moving fence. Try again\n"); goto validate_map_fail;
}
/* Wait for validate and PT updates to finish */
amdgpu_sync_wait(&sync_obj, false);
/* The old eviction fence may be unsignaled if restore happens * after a GPU reset or suspend/resume. Keep the old fence in that * case. Otherwise release the old eviction fence and create new * one, because fence only goes from unsignaled to signaled once * and cannot be reused. Use context and mm from the old fence. * * If an old eviction fence signals after this check, that's OK. * Anyone signaling an eviction fence must stop the queues first * and schedule another restore worker.
*/ if (dma_fence_is_signaled(&process_info->eviction_fence->base)) { struct amdgpu_amdkfd_fence *new_fence =
amdgpu_amdkfd_fence_create(
process_info->eviction_fence->base.context,
process_info->eviction_fence->mm,
NULL);
if (!new_fence) {
pr_err("Failed to create eviction fence\n");
ret = -ENOMEM; goto validate_map_fail;
}
dma_fence_put(&process_info->eviction_fence->base);
process_info->eviction_fence = new_fence;
replace_eviction_fence(ef, dma_fence_get(&new_fence->base));
} else {
WARN_ONCE(*ef != &process_info->eviction_fence->base, "KFD eviction fence doesn't match KGD process_info");
}
/* Attach new eviction fence to all BOs except pinned ones */
list_for_each_entry(mem, &process_info->kfd_bo_list, validate_list) { if (mem->bo->tbo.pin_count) continue;
/* Validate gws bo the first time it is added to process */
mutex_lock(&(*mem)->process_info->lock);
ret = amdgpu_bo_reserve(gws_bo, false); if (unlikely(ret)) {
pr_err("Reserve gws bo failed %d\n", ret); goto bo_reservation_failure;
}
ret = amdgpu_amdkfd_bo_validate(gws_bo, AMDGPU_GEM_DOMAIN_GWS, true); if (ret) {
pr_err("GWS BO validate failed %d\n", ret); goto bo_validation_failure;
} /* GWS resource is shared b/t amdgpu and amdkfd * Add process eviction fence to bo so they can * evict each other.
*/
ret = dma_resv_reserve_fences(gws_bo->tbo.base.resv, 1); if (ret) goto reserve_shared_fail;
dma_resv_add_fence(gws_bo->tbo.base.resv,
&process_info->eviction_fence->base,
DMA_RESV_USAGE_BOOKKEEP);
amdgpu_bo_unreserve(gws_bo);
mutex_unlock(&(*mem)->process_info->lock);
/* Print KFD's memory accounting into the seq_file @m.
 *
 * Emits two lines: system memory used versus its limit, and TTM memory
 * used versus its limit. All four byte counters are shifted right by 20,
 * i.e. reported in units of 2^20 bytes. The counters are sampled and
 * printed while holding kfd_mem_limit.mem_limit_lock.
 *
 * @m: seq_file to print into
 * @data: not used by this function
 *
 * Return: always 0
 */
int kfd_debugfs_kfd_mem_limits(struct seq_file *m, void *data)
{
	int64_t sys_used_mb, ttm_used_mb;
	uint64_t sys_max_mb, ttm_max_mb;

	spin_lock(&kfd_mem_limit.mem_limit_lock);

	/* System memory: usage counter is signed, limit is unsigned */
	sys_used_mb = kfd_mem_limit.system_mem_used >> 20;
	sys_max_mb = kfd_mem_limit.max_system_mem_limit >> 20;
	seq_printf(m, "System mem used %lldM out of %lluM\n",
		   sys_used_mb, sys_max_mb);

	/* TTM memory, same layout as above */
	ttm_used_mb = kfd_mem_limit.ttm_mem_used >> 20;
	ttm_max_mb = kfd_mem_limit.max_ttm_mem_limit >> 20;
	seq_printf(m, "TTM mem used %lldM out of %lluM\n",
		   ttm_used_mb, ttm_max_mb);

	spin_unlock(&kfd_mem_limit.mem_limit_lock);

	return 0;
}
#endif
Messung V0.5 in Prozent
¤ Dauer der Verarbeitung: 0.31 Sekunden
(vorverarbeitet am 2026-04-29)
¤
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.