// SPDX-License-Identifier: GPL-2.0 OR MIT
/*
 * Copyright 2014-2022 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */
/*
 * get_mqd_type_from_queue_type - map a KFD queue type to the MQD type
 * that manages it.
 *
 * SDMA and XGMI-SDMA queues share the SDMA MQD format; every other
 * queue type is managed through a CP MQD.
 */
static inline enum KFD_MQD_TYPE get_mqd_type_from_queue_type(enum kfd_queue_type type)
{
	if (type == KFD_QUEUE_TYPE_SDMA || type == KFD_QUEUE_TYPE_SDMA_XGMI)
		return KFD_MQD_TYPE_SDMA;
	return KFD_MQD_TYPE_CP;
}
staticbool is_pipe_enabled(struct device_queue_manager *dqm, int mec, int pipe)
{ int i; int pipe_offset = (mec * dqm->dev->kfd->shared_resources.num_pipe_per_mec
+ pipe) * dqm->dev->kfd->shared_resources.num_queue_per_pipe;
/* queue is available for KFD usage if bit is 1 */ for (i = 0; i < dqm->dev->kfd->shared_resources.num_queue_per_pipe; ++i) if (test_bit(pipe_offset + i,
dqm->dev->kfd->shared_resources.cp_queue_bitmap)) returntrue; returnfalse;
}
amdgpu_mes_lock(&adev->mes);
r = adev->mes.funcs->add_hw_queue(&adev->mes, &queue_input);
amdgpu_mes_unlock(&adev->mes);
up_read(&adev->reset_domain->sem); if (r) {
dev_err(adev->dev, "failed to add hardware queue to MES, doorbell=0x%x\n",
q->properties.doorbell_off);
dev_err(adev->dev, "MES might be in unrecoverable state, issue a GPU reset\n");
kfd_hws_hang(dqm);
}
amdgpu_mes_lock(&adev->mes);
r = adev->mes.funcs->remove_hw_queue(&adev->mes, &queue_input);
amdgpu_mes_unlock(&adev->mes);
up_read(&adev->reset_domain->sem);
if (r) {
dev_err(adev->dev, "failed to remove hardware queue from MES, doorbell=0x%x\n",
q->properties.doorbell_off);
dev_err(adev->dev, "MES might be in unrecoverable state, issue a GPU reset\n");
kfd_hws_hang(dqm);
}
list_for_each_entry(cur, &dqm->queues, list) {
qpd = cur->qpd;
list_for_each_entry(q, &qpd->queues_list, list) { if (!q->properties.is_active) continue;
retval = add_queue_mes(dqm, q, qpd); if (retval) {
dev_err(dev, "%s: Failed to add queue %d for dev %d",
__func__,
q->properties.queue_id,
dqm->dev->id); return retval;
}
}
}
return retval;
}
/*
 * suspend_all_queues_mes - ask the MES firmware to suspend all gangs.
 *
 * Holds the reset-domain read lock across the MES call so a concurrent
 * GPU reset cannot race with it; returns -EIO if a reset is already in
 * progress (trylock fails). If MES reports failure, the hardware
 * scheduler is assumed to be in an unrecoverable state and
 * kfd_hws_hang() is called to trigger a GPU reset.
 *
 * Return: 0 on success, -EIO if a reset holds the lock, or the error
 * from amdgpu_mes_suspend().
 */
static int suspend_all_queues_mes(struct device_queue_manager *dqm)
{
	struct amdgpu_device *adev = (struct amdgpu_device *)dqm->dev->adev;
	int r = 0;

	if (!down_read_trylock(&adev->reset_domain->sem))
		return -EIO;

	r = amdgpu_mes_suspend(adev);
	up_read(&adev->reset_domain->sem);

	if (r) {
		dev_err(adev->dev, "failed to suspend gangs from MES\n");
		dev_err(adev->dev, "MES might be in unrecoverable state, issue a GPU reset\n");
		kfd_hws_hang(dqm);
	}

	return r;
}
/*
 * resume_all_queues_mes - ask the MES firmware to resume all gangs.
 *
 * Mirror of suspend_all_queues_mes(): holds the reset-domain read lock
 * across the MES call and returns -EIO when a reset is in progress. On
 * MES failure the HWS is assumed hung and kfd_hws_hang() schedules a
 * GPU reset.
 *
 * Note: the original extracted text was truncated after the error
 * branch; the trailing "return r;" is restored here to match the
 * suspend counterpart (a non-void function must not fall off the end).
 *
 * Return: 0 on success, -EIO if a reset holds the lock, or the error
 * from amdgpu_mes_resume().
 */
static int resume_all_queues_mes(struct device_queue_manager *dqm)
{
	struct amdgpu_device *adev = (struct amdgpu_device *)dqm->dev->adev;
	int r = 0;

	if (!down_read_trylock(&adev->reset_domain->sem))
		return -EIO;

	r = amdgpu_mes_resume(adev);
	up_read(&adev->reset_domain->sem);

	if (r) {
		dev_err(adev->dev, "failed to resume gangs from MES\n");
		dev_err(adev->dev, "MES might be in unrecoverable state, issue a GPU reset\n");
		kfd_hws_hang(dqm);
	}

	return r;
}
if (q->properties.is_gws) {
dqm->gws_queue_count--;
qpd->mapped_gws_queue = false;
}
}
/* * Allocate a doorbell ID to this queue. * If doorbell_id is passed in, make sure requested ID is valid then allocate it.
*/ staticint allocate_doorbell(struct qcm_process_device *qpd, struct queue *q,
uint32_t const *restore_id)
{ struct kfd_node *dev = qpd->dqm->dev;
if (!KFD_IS_SOC15(dev)) { /* On pre-SOC15 chips we need to use the queue ID to * preserve the user mode ABI.
*/
if (restore_id && *restore_id != q->properties.queue_id) return -EINVAL;
q->doorbell_id = q->properties.queue_id;
} elseif (q->properties.type == KFD_QUEUE_TYPE_SDMA ||
q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) { /* For SDMA queues on SOC15 with 8-byte doorbell, use static * doorbell assignments based on the engine and queue id. * The doobell index distance between RLC (2*i) and (2*i+1) * for a SDMA engine is 512.
*/
/* * q->properties.sdma_engine_id corresponds to the virtual * sdma engine number. However, for doorbell allocation, * we need the physical sdma engine id in order to get the * correct doorbell offset.
*/
uint32_t valid_id = idx_offset[qpd->dqm->dev->node_id *
get_num_all_sdma_engines(qpd->dqm) +
q->properties.sdma_engine_id]
+ (q->properties.sdma_queue_id & 1)
* KFD_QUEUE_DOORBELL_MIRROR_OFFSET
+ (q->properties.sdma_queue_id >> 1);
if (restore_id && *restore_id != valid_id) return -EINVAL;
q->doorbell_id = valid_id;
} else { /* For CP queues on SOC15 */ if (restore_id) { /* make sure that ID is free */ if (__test_and_set_bit(*restore_id, qpd->doorbell_bitmap)) return -EINVAL;
q->doorbell_id = *restore_id;
} else { /* or reserve a free doorbell ID */ unsignedint found;
if (KFD_IS_SOC15(dqm->dev) && dqm->dev->kfd->cwsr_enabled)
program_trap_handler_settings(dqm, qpd);
/* qpd->page_table_base is set earlier when register_process() * is called, i.e. when the first queue is created.
*/
dqm->dev->kfd2kgd->set_vm_context_page_table_base(dqm->dev->adev,
qpd->vmid,
qpd->page_table_base); /* invalidate the VM context after pasid and vmid mapping is set up */
kfd_flush_tlb(qpd_to_pdd(qpd), TLB_FLUSH_LEGACY);
if (dqm->dev->kfd2kgd->set_scratch_backing_va)
dqm->dev->kfd2kgd->set_scratch_backing_va(dqm->dev->adev,
qpd->sh_hidden_private_base, qpd->vmid);
/* On GFX v7, CP doesn't flush TC at dequeue */ if (q->device->adev->asic_type == CHIP_HAWAII) if (flush_texture_cache_nocpsch(q->device, qpd))
dev_err(dev, "Failed to flush TC\n");
if (dqm->total_queue_count >= max_num_of_queues_per_device) {
pr_warn("Can't create new usermode queue because %d queues were already created\n",
dqm->total_queue_count);
retval = -EPERM; goto out_unlock;
}
if (list_empty(&qpd->queues_list)) {
retval = allocate_vmid(dqm, qpd, q); if (retval) goto out_unlock;
}
q->properties.vmid = qpd->vmid; /* * Eviction state logic: mark all queues as evicted, even ones * not currently active. Restoring inactive queues later only * updates the is_evicted flag but is a no-op otherwise.
*/
q->properties.is_evicted = !!qpd->evicted;
if (q->properties.is_active) { if (!dqm->sched_running) {
WARN_ONCE(1, "Load non-HWS mqd while stopped\n"); goto add_queue_to_list;
}
if (WARN(q->process->mm != current->mm, "should only run in user thread"))
retval = -EFAULT; else
retval = mqd_mgr->load_mqd(mqd_mgr, q->mqd, q->pipe,
q->queue, &q->properties, current->mm); if (retval) goto out_free_mqd;
}
add_queue_to_list:
list_add(&q->list, &qpd->queues_list);
qpd->queue_count++; if (q->properties.is_active)
increment_queue_count(dqm, qpd, q);
/* * Unconditionally increment this counter, regardless of the queue's * type or whether the queue is active.
*/
dqm->total_queue_count++;
pr_debug("Total of %d queues are accountable so far\n",
dqm->total_queue_count); goto out_unlock;
/* Access to DQM has to be locked before calling destroy_queue_nocpsch_locked * to avoid asynchronized access
*/ staticint destroy_queue_nocpsch_locked(struct device_queue_manager *dqm, struct qcm_process_device *qpd, struct queue *q)
{ int retval; struct mqd_manager *mqd_mgr;
list_del(&q->list); if (list_empty(&qpd->queues_list)) { if (qpd->reset_wavefronts) {
pr_warn("Resetting wave fronts (nocpsch) on dev %p\n",
dqm->dev); /* dbgdev_wave_reset_wavefronts has to be called before * deallocate_vmid(), i.e. when vmid is still in use.
*/
dbgdev_wave_reset_wavefronts(dqm->dev,
qpd->pqm->process);
qpd->reset_wavefronts = false;
}
deallocate_vmid(dqm, qpd, q);
}
qpd->queue_count--; if (q->properties.is_active)
decrement_queue_count(dqm, qpd, q);
/* Save previous activity state for counters */
prev_active = q->properties.is_active;
/* Make sure the queue is unmapped before updating the MQD */ if (dqm->sched_policy != KFD_SCHED_POLICY_NO_HWS) { if (!dqm->dev->kfd->shared_resources.enable_mes)
retval = unmap_queues_cpsch(dqm,
KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0, USE_DEFAULT_GRACE_PERIOD, false); elseif (prev_active)
retval = remove_queue_mes(dqm, q, &pdd->qpd);
/* queue is reset so inaccessable */ if (pdd->has_reset_queue) {
retval = -EACCES; goto out_unlock;
}
/* * check active state vs. the previous state and modify * counter accordingly. map_queues_cpsch uses the * dqm->active_queue_count to determine whether a new runlist must be * uploaded.
*/ if (q->properties.is_active && !prev_active) {
increment_queue_count(dqm, &pdd->qpd, q);
} elseif (!q->properties.is_active && prev_active) {
decrement_queue_count(dqm, &pdd->qpd, q);
} elseif (q->gws && !q->properties.is_gws) { if (q->properties.is_active) {
dqm->gws_queue_count++;
pdd->qpd.mapped_gws_queue = true;
}
q->properties.is_gws = true;
} elseif (!q->gws && q->properties.is_gws) { if (q->properties.is_active) {
dqm->gws_queue_count--;
pdd->qpd.mapped_gws_queue = false;
}
q->properties.is_gws = false;
}
if (dqm->sched_policy != KFD_SCHED_POLICY_NO_HWS) { if (!dqm->dev->kfd->shared_resources.enable_mes)
retval = map_queues_cpsch(dqm); elseif (q->properties.is_active)
retval = add_queue_mes(dqm, q, &pdd->qpd);
} elseif (q->properties.is_active &&
(q->properties.type == KFD_QUEUE_TYPE_COMPUTE ||
q->properties.type == KFD_QUEUE_TYPE_SDMA ||
q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)) { if (WARN(q->process->mm != current->mm, "should only run in user thread"))
retval = -EFAULT; else
retval = mqd_mgr->load_mqd(mqd_mgr, q->mqd,
q->pipe, q->queue,
&q->properties, current->mm);
}
out_unlock:
dqm_unlock(dqm); return retval;
}
/* suspend_single_queue does not lock the dqm like the * evict_process_queues_cpsch or evict_process_queues_nocpsch. You should * lock the dqm before calling, and unlock after calling. * * The reason we don't lock the dqm is because this function may be * called on multiple queues in a loop, so rather than locking/unlocking * multiple times, we will just keep the dqm locked for all of the calls.
*/ staticint suspend_single_queue(struct device_queue_manager *dqm, struct kfd_process_device *pdd, struct queue *q)
{ bool is_new;
if (q->properties.is_suspended) return 0;
pr_debug("Suspending process pid %d queue [%i]\n",
pdd->process->lead_thread->pid,
q->properties.queue_id);
if (is_new || q->properties.is_being_destroyed) {
pr_debug("Suspend: skip %s queue id %i\n",
is_new ? "new" : "destroyed",
q->properties.queue_id); return -EBUSY;
}
q->properties.is_suspended = true; if (q->properties.is_active) { if (dqm->dev->kfd->shared_resources.enable_mes) { int r = remove_queue_mes(dqm, q, &pdd->qpd);
/* resume_single_queue does not lock the dqm like the functions * restore_process_queues_cpsch or restore_process_queues_nocpsch. You should * lock the dqm before calling, and unlock after calling. * * The reason we don't lock the dqm is because this function may be * called on multiple queues in a loop, so rather than locking/unlocking * multiple times, we will just keep the dqm locked for all of the calls.
*/ staticint resume_single_queue(struct device_queue_manager *dqm, struct qcm_process_device *qpd, struct queue *q)
{ struct kfd_process_device *pdd;
if (!q->properties.is_suspended) return 0;
pdd = qpd_to_pdd(qpd);
pr_debug("Restoring from suspend process pid %d queue [%i]\n",
pdd->process->lead_thread->pid,
q->properties.queue_id);
q->properties.is_suspended = false;
if (QUEUE_IS_ACTIVE(q->properties)) { if (dqm->dev->kfd->shared_resources.enable_mes) { int r = add_queue_mes(dqm, q, &pdd->qpd);
dqm_lock(dqm); if (qpd->evicted++ > 0) /* already evicted, do nothing */ goto out;
pdd = qpd_to_pdd(qpd);
pr_debug_ratelimited("Evicting process pid %d queues\n",
pdd->process->lead_thread->pid);
pdd->last_evict_timestamp = get_jiffies_64(); /* Mark all queues as evicted. Deactivate all active queues on * the qpd.
*/
list_for_each_entry(q, &qpd->queues_list, list) {
q->properties.is_evicted = true; if (!q->properties.is_active) continue;
if (WARN_ONCE(!dqm->sched_running, "Evict when stopped\n")) continue;
retval = mqd_mgr->destroy_mqd(mqd_mgr, q->mqd,
(dqm->dev->kfd->cwsr_enabled ?
KFD_PREEMPT_TYPE_WAVEFRONT_SAVE :
KFD_PREEMPT_TYPE_WAVEFRONT_DRAIN),
KFD_UNMAP_LATENCY_MS, q->pipe, q->queue); if (retval && !ret) /* Return the first error, but keep going to * maintain a consistent eviction state
*/
ret = retval;
}
dqm_lock(dqm); if (qpd->evicted++ > 0) /* already evicted, do nothing */ goto out;
pdd = qpd_to_pdd(qpd);
/* The debugger creates processes that temporarily have not acquired * all VMs for all devices and has no VMs itself. * Skip queue eviction on process eviction.
*/ if (!pdd->drm_priv) goto out;
pr_debug_ratelimited("Evicting process pid %d queues\n",
pdd->process->lead_thread->pid);
if (dqm->dev->kfd->shared_resources.enable_mes) {
pdd->last_evict_timestamp = get_jiffies_64();
retval = suspend_all_queues_mes(dqm); if (retval) {
dev_err(dev, "Suspending all queues failed"); goto out;
}
}
/* Mark all queues as evicted. Deactivate all active queues on * the qpd.
*/
list_for_each_entry(q, &qpd->queues_list, list) {
q->properties.is_evicted = true; if (!q->properties.is_active) continue;
dqm_lock(dqm); if (WARN_ON_ONCE(!qpd->evicted)) /* already restored, do nothing */ goto out; if (qpd->evicted > 1) { /* ref count still > 0, decrement & quit */
qpd->evicted--; goto out;
}
pr_debug_ratelimited("Restoring process pid %d queues\n",
pdd->process->lead_thread->pid);
/* Update PD Base in QPD */
qpd->page_table_base = pd_base;
pr_debug("Updated PD address to 0x%llx\n", pd_base);
if (!list_empty(&qpd->queues_list)) {
dqm->dev->kfd2kgd->set_vm_context_page_table_base(
dqm->dev->adev,
qpd->vmid,
qpd->page_table_base);
kfd_flush_tlb(pdd, TLB_FLUSH_LEGACY);
}
/* Take a safe reference to the mm_struct, which may otherwise * disappear even while the kfd_process is still referenced.
*/
mm = get_task_mm(pdd->process->lead_thread); if (!mm) {
ret = -EFAULT; goto out;
}
/* Remove the eviction flags. Activate queues that are not * inactive for other reasons.
*/
list_for_each_entry(q, &qpd->queues_list, list) {
q->properties.is_evicted = false; if (!QUEUE_IS_ACTIVE(q->properties)) continue;
dqm_lock(dqm); if (WARN_ON_ONCE(!qpd->evicted)) /* already restored, do nothing */ goto out; if (qpd->evicted > 1) { /* ref count still > 0, decrement & quit */
qpd->evicted--; goto out;
}
/* The debugger creates processes that temporarily have not acquired * all VMs for all devices and has no VMs itself. * Skip queue restore on process restore.
*/ if (!pdd->drm_priv) goto vm_not_acquired;
pr_debug_ratelimited("Restoring process pid %d queues\n",
pdd->process->lead_thread->pid);
/* Update PD Base in QPD */
qpd->page_table_base = amdgpu_amdkfd_gpuvm_get_process_page_dir(pdd->drm_priv);
pr_debug("Updated PD address to 0x%llx\n", qpd->page_table_base);
/* activate all active queues on the qpd */
list_for_each_entry(q, &qpd->queues_list, list) {
q->properties.is_evicted = false; if (!QUEUE_IS_ACTIVE(q->properties)) continue;
/* Update PD Base in QPD */
qpd->page_table_base = pd_base;
pr_debug("Updated PD address to 0x%llx\n", pd_base);
retval = dqm->asic_ops.update_qpd(dqm, qpd);
dqm->processes_count++;
dqm_unlock(dqm);
/* Outside the DQM lock because under the DQM lock we can't do * reclaim or take other locks that others hold while reclaiming.
*/
kfd_inc_compute_active(dqm->dev);
pr_debug("qpd->queues_list is %s\n",
list_empty(&qpd->queues_list) ? "empty" : "not empty");
retval = 0;
dqm_lock(dqm);
list_for_each_entry_safe(cur, next, &dqm->queues, list) { if (qpd == cur->qpd) {
list_del(&cur->list);
kfree(cur);
dqm->processes_count--; goto out;
}
} /* qpd not found in dqm list */
retval = 1;
out:
dqm_unlock(dqm);
/* Outside the DQM lock because under the DQM lock we can't do * reclaim or take other locks that others hold while reclaiming.
*/ if (!retval)
kfd_dec_compute_active(dqm->dev);
if (q->properties.type == KFD_QUEUE_TYPE_SDMA) { if (bitmap_empty(dqm->sdma_bitmap, get_num_sdma_queues(dqm))) {
dev_warn(dev, "No more SDMA queue to allocate (%d total queues)\n",
get_num_sdma_queues(dqm)); return -ENOMEM;
}
if (restore_sdma_id) { /* Re-use existing sdma_id */ if (!test_bit(*restore_sdma_id, dqm->sdma_bitmap)) {
dev_err(dev, "SDMA queue already in use\n"); return -EBUSY;
}
clear_bit(*restore_sdma_id, dqm->sdma_bitmap);
q->sdma_id = *restore_sdma_id;
} else { /* Find first available sdma_id */
bit = find_first_bit(dqm->sdma_bitmap,
get_num_sdma_queues(dqm));
clear_bit(bit, dqm->sdma_bitmap);
q->sdma_id = bit;
}
q->properties.sdma_engine_id =
q->sdma_id % kfd_get_num_sdma_engines(dqm->dev);
q->properties.sdma_queue_id = q->sdma_id /
kfd_get_num_sdma_engines(dqm->dev);
} elseif (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) { if (bitmap_empty(dqm->xgmi_sdma_bitmap, get_num_xgmi_sdma_queues(dqm))) {
dev_warn(dev, "No more XGMI SDMA queue to allocate (%d total queues)\n",
get_num_xgmi_sdma_queues(dqm)); return -ENOMEM;
} if (restore_sdma_id) { /* Re-use existing sdma_id */ if (!test_bit(*restore_sdma_id, dqm->xgmi_sdma_bitmap)) {
dev_err(dev, "SDMA queue already in use\n"); return -EBUSY;
}
clear_bit(*restore_sdma_id, dqm->xgmi_sdma_bitmap);
q->sdma_id = *restore_sdma_id;
} else {
bit = find_first_bit(dqm->xgmi_sdma_bitmap,
get_num_xgmi_sdma_queues(dqm));
clear_bit(bit, dqm->xgmi_sdma_bitmap);
q->sdma_id = bit;
} /* sdma_engine_id is sdma id including * both PCIe-optimized SDMAs and XGMI- * optimized SDMAs. The calculation below * assumes the first N engines are always * PCIe-optimized ones
*/
q->properties.sdma_engine_id =
kfd_get_num_sdma_engines(dqm->dev) +
q->sdma_id % kfd_get_num_xgmi_sdma_engines(dqm->dev);
q->properties.sdma_queue_id = q->sdma_id /
kfd_get_num_xgmi_sdma_engines(dqm->dev);
} elseif (q->properties.type == KFD_QUEUE_TYPE_SDMA_BY_ENG_ID) { int i, num_queues, num_engines, eng_offset = 0, start_engine; bool free_bit_found = false, is_xgmi = false;
/* Scan available bit based on target engine ID. */
start_engine = q->properties.sdma_engine_id - eng_offset; for (i = start_engine; i < num_queues; i += num_engines) {
if (!test_bit(i, is_xgmi ? dqm->xgmi_sdma_bitmap : dqm->sdma_bitmap)) continue;
if (!free_bit_found) {
dev_warn(dev, "No more SDMA queue to allocate for target ID %i (%d total queues)\n",
q->properties.sdma_engine_id, num_queues); return -ENOMEM;
}
}
/* * Device Queue Manager implementation for cp scheduler
*/
staticint set_sched_resources(struct device_queue_manager *dqm)
{ int i, mec; struct scheduling_resources res; struct device *dev = dqm->dev->adev->dev;
res.vmid_mask = dqm->dev->compute_vmid_bitmap;
res.queue_mask = 0; for (i = 0; i < AMDGPU_MAX_QUEUES; ++i) {
mec = (i / dqm->dev->kfd->shared_resources.num_queue_per_pipe)
/ dqm->dev->kfd->shared_resources.num_pipe_per_mec;
if (!test_bit(i, dqm->dev->kfd->shared_resources.cp_queue_bitmap)) continue;
/* only acquire queues from the first MEC */ if (mec > 0) continue;
/* This situation may be hit in the future if a new HW * generation exposes more than 64 queues. If so, the * definition of res.queue_mask needs updating
*/ if (WARN_ON(i >= (sizeof(res.queue_mask)*8))) {
dev_err(dev, "Invalid queue enabled by amdgpu: %d\n", i); break;
}
/* halt_cpsch:
 * Unmap queues so the scheduler doesn't continue remaining jobs in the queue.
 * Then set dqm->sched_halt so queues don't map to runlist until unhalt_cpsch
 * is called.
 */
static int halt_cpsch(struct device_queue_manager *dqm)
{
	int ret = 0;

	dqm_lock(dqm);
	if (!dqm->sched_running) {
		/* Nothing to halt if the scheduler is not running */
		dqm_unlock(dqm);
		return 0;
	}

	WARN_ONCE(dqm->sched_halt, "Scheduling is already on halt\n");

	/*
	 * Unmap all outstanding queues first so nothing keeps executing;
	 * skip the unmap when HWS is already known to be hung.
	 */
	if (!dqm->is_hws_hang) {
		if (!dqm->dev->kfd->shared_resources.enable_mes)
			ret = unmap_queues_cpsch(dqm,
						 KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0,
						 USE_DEFAULT_GRACE_PERIOD, false);
		else
			ret = remove_all_kfd_queues_mes(dqm);
	}
	dqm->sched_halt = true;
	dqm_unlock(dqm);

	return ret;
}
/* unhalt_cpsch * Unset dqm->sched_halt and map queues back to runlist
*/ staticint unhalt_cpsch(struct device_queue_manager *dqm)
{ int ret = 0;
dqm_lock(dqm); if (!dqm->sched_running || !dqm->sched_halt) {
WARN_ONCE(!dqm->sched_halt, "Scheduling is not on halt.\n");
dqm_unlock(dqm); return 0;
}
dqm->sched_halt = false; if (!dqm->dev->kfd->shared_resources.enable_mes)
ret = execute_queues_cpsch(dqm,
KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES,
0, USE_DEFAULT_GRACE_PERIOD); else
ret = add_all_kfd_queues_mes(dqm);
if (!dqm->dev->kfd->shared_resources.enable_mes)
unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0, USE_DEFAULT_GRACE_PERIOD, false); else
remove_all_kfd_queues_mes(dqm);
dqm->sched_running = false;
if (!dqm->dev->kfd->shared_resources.enable_mes)
pm_release_ib(&dqm->packet_mgr);
kfd_gtt_sa_free(dqm->dev, dqm->fence_mem); if (!dqm->dev->kfd->shared_resources.enable_mes)
pm_uninit(&dqm->packet_mgr);
kfree(dqm->detect_hang_info);
dqm->detect_hang_info = NULL;
dqm_unlock(dqm);
return 0;
}
staticint create_kernel_queue_cpsch(struct device_queue_manager *dqm, struct kernel_queue *kq, struct qcm_process_device *qpd)
{
dqm_lock(dqm); if (dqm->total_queue_count >= max_num_of_queues_per_device) {
pr_warn("Can't create new kernel queue because %d queues were already created\n",
dqm->total_queue_count);
dqm_unlock(dqm); return -EPERM;
}
/* * Unconditionally increment this counter, regardless of the queue's * type or whether the queue is active.
*/
dqm->total_queue_count++;
pr_debug("Total of %d queues are accountable so far\n",
dqm->total_queue_count);
if (q->properties.type == KFD_QUEUE_TYPE_SDMA ||
q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)
dqm->asic_ops.init_sdma_vm(dqm, q, qpd);
q->properties.tba_addr = qpd->tba_addr;
q->properties.tma_addr = qpd->tma_addr;
q->mqd_mem_obj = mqd_mgr->allocate_mqd(mqd_mgr->dev, &q->properties); if (!q->mqd_mem_obj) {
retval = -ENOMEM; goto out_deallocate_doorbell;
}
dqm_lock(dqm); /* * Eviction state logic: mark all queues as evicted, even ones * not currently active. Restoring inactive queues later only * updates the is_evicted flag but is a no-op otherwise.
*/
q->properties.is_evicted = !!qpd->evicted;
q->properties.is_dbg_wa = qpd->pqm->process->debug_trap_enabled &&
kfd_dbg_has_cwsr_workaround(q->device);
while (*fence_addr != fence_value) { /* Fatal err detected, this response won't come */ if (amdgpu_amdkfd_is_fed(dqm->dev->adev)) return -EIO;
if (time_after(jiffies, end_jiffies)) {
dev_err(dev, "qcm fence wait loop timeout expired\n"); /* In HWS case, this is used to halt the driver thread * in order not to mess up CP states before doing * scandumps for FW debugging.
*/ while (halt_if_hws_hang)
schedule();
return -ETIME;
}
schedule();
}
return 0;
}
/* dqm->lock mutex has to be locked before calling this function */ staticint map_queues_cpsch(struct device_queue_manager *dqm)
{ struct device *dev = dqm->dev->adev->dev; int retval;
if (!dqm->sched_running || dqm->sched_halt) return 0; if (dqm->active_queue_count <= 0 || dqm->processes_count <= 0) return 0; if (dqm->active_runlist) return 0;
retval = pm_send_runlist(&dqm->packet_mgr, &dqm->queues);
pr_debug("%s sent runlist\n", __func__); if (retval) {
dev_err(dev, "failed to execute runlist\n"); return retval;
}
dqm->active_runlist = true;
/* either reset failed or we reset an unexpected queue. */ if (queue_addr != q->properties.queue_address) {
r = -ENOTRECOVERABLE; goto reset_fail;
}
set_queue_as_reset(dqm, q, &pdd->qpd);
reset_count++;
}
if (reset_count == dqm->detect_hang_count)
kfd_signal_reset_event(dqm->dev); else
r = -ENOTRECOVERABLE;
reset_fail:
dqm->detect_hang_count = 0;
return r;
}
staticbool sdma_has_hang(struct device_queue_manager *dqm)
{ int engine_start = dqm->dev->node_id * get_num_all_sdma_engines(dqm); int engine_end = engine_start + get_num_all_sdma_engines(dqm); int num_queues_per_eng = dqm->dev->kfd->device_info.num_sdma_queues_per_engine; int i, j;
for (i = engine_start; i < engine_end; i++) { for (j = 0; j < num_queues_per_eng; j++) { if (!dqm->dev->kfd2kgd->hqd_sdma_get_doorbell(dqm->dev->adev, i, j)) continue;
staticint reset_hung_queues_sdma(struct device_queue_manager *dqm)
{ int engine_start = dqm->dev->node_id * get_num_all_sdma_engines(dqm); int engine_end = engine_start + get_num_all_sdma_engines(dqm); int num_queues_per_eng = dqm->dev->kfd->device_info.num_sdma_queues_per_engine; int r = 0, i, j;
if (dqm->is_hws_hang) return -EIO;
/* Scan for hung HW queues and reset engine. */
dqm->detect_hang_count = 0; for (i = engine_start; i < engine_end; i++) { for (j = 0; j < num_queues_per_eng; j++) {
uint32_t doorbell_off =
dqm->dev->kfd2kgd->hqd_sdma_get_doorbell(dqm->dev->adev, i, j);
if (!doorbell_off) continue;
/* Reset engine and check. */ if (amdgpu_sdma_reset_engine(dqm->dev->adev, i, false) ||
dqm->dev->kfd2kgd->hqd_sdma_get_doorbell(dqm->dev->adev, i, j) ||
!set_sdma_queue_as_reset(dqm, doorbell_off)) {
r = -ENOTRECOVERABLE; goto reset_fail;
}
/* Should only expect one queue active per engine */
dqm->detect_hang_count++; break;
}
}
/* Signal process reset */ if (dqm->detect_hang_count)
kfd_signal_reset_event(dqm->dev); else
r = -ENOTRECOVERABLE;
/*
 * NOTE(review): the following disclaimer text is not C code and appears to
 * have been appended accidentally from a web page. It is preserved here but
 * commented out so the file remains parseable. Original text (German):
 *
 * Die Informationen auf dieser Webseite wurden nach bestem Wissen
 * sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit,
 * noch Richtigkeit, noch Qualität der bereitgestellten Informationen
 * zugesichert.
 *
 * Bemerkung:
 * Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.
 */