/*
 * Module parameters. Each one is registered with permission mode 0444 and is
 * paired with a MODULE_PARM_DESC() string that states its accepted values and
 * its default.
 */
/* int: whether to reset the device when a lockup is detected */
module_param(reset_on_lockup, int, 0444);
MODULE_PARM_DESC(reset_on_lockup, "Do device reset on lockup (0 = no, 1 = yes, default yes)");
/* int: whether to scrub device memory */
module_param(memory_scrub, int, 0444);
MODULE_PARM_DESC(memory_scrub, "Scrub device memory in various states (0 = no, 1 = yes, default no)");
/* ulong bitmask: a cleared bit X masks error X reported during device CPU boot */
module_param(boot_error_status_mask, ulong, 0444);
MODULE_PARM_DESC(boot_error_status_mask, "Mask of the error status during device CPU boot (If bitX is cleared then error X is masked. Default all 1's)");
/* * get_asic_type - translate device id to asic type * * @hdev: pointer to habanalabs device structure. * * Translate device id and revision id to asic type. * In case of unidentified device, return -1
*/ staticenum hl_asic_type get_asic_type(struct hl_device *hdev)
{ struct pci_dev *pdev = hdev->pdev; enum hl_asic_type asic_type = ASIC_INVALID;
switch (pdev->device) { case PCI_IDS_GOYA:
asic_type = ASIC_GOYA; break; case PCI_IDS_GAUDI:
asic_type = ASIC_GAUDI; break; case PCI_IDS_GAUDI_SEC:
asic_type = ASIC_GAUDI_SEC; break; case PCI_IDS_GAUDI2: switch (pdev->revision) { case REV_ID_A:
asic_type = ASIC_GAUDI2; break; case REV_ID_B:
asic_type = ASIC_GAUDI2B; break; case REV_ID_C:
asic_type = ASIC_GAUDI2C; break; case REV_ID_D:
asic_type = ASIC_GAUDI2D; break; default: break;
} break; default: break;
}
/* * hl_device_open() - open function for habanalabs device. * @ddev: pointer to DRM device structure. * @file: pointer to DRM file private data structure. * * Called when process opens an habanalabs device.
*/ int hl_device_open(struct drm_device *ddev, struct drm_file *file_priv)
{ struct hl_device *hdev = to_hl_device(ddev); enum hl_device_status status; struct hl_fpriv *hpriv; int rc;
hpriv = kzalloc(sizeof(*hpriv), GFP_KERNEL); if (!hpriv) return -ENOMEM;
if (!hl_device_operational(hdev, &status)) {
dev_dbg_ratelimited(hdev->dev, "Can't open %s because it is %s\n",
dev_name(hdev->dev), hdev->status[status]);
if (status == HL_DEVICE_STATUS_IN_RESET ||
status == HL_DEVICE_STATUS_IN_RESET_AFTER_DEVICE_RELEASE)
rc = -EAGAIN; else
rc = -EPERM;
goto out_err;
}
if (hdev->is_in_dram_scrub) {
dev_dbg_ratelimited(hdev->dev, "Can't open %s during dram scrub\n",
dev_name(hdev->dev));
rc = -EAGAIN; goto out_err;
}
if (hdev->compute_ctx_in_release) {
dev_dbg_ratelimited(hdev->dev, "Can't open %s because another user is still releasing it\n",
dev_name(hdev->dev));
rc = -EAGAIN; goto out_err;
}
if (hdev->is_compute_ctx_active) {
dev_dbg_ratelimited(hdev->dev, "Can't open %s because another user is working on it\n",
dev_name(hdev->dev));
rc = -EBUSY; goto out_err;
}
rc = hl_ctx_create(hdev, hpriv); if (rc) {
dev_err(hdev->dev, "Failed to create context %d\n", rc); goto out_err;
}
hpriv = kzalloc(sizeof(*hpriv), GFP_KERNEL); if (!hpriv) return -ENOMEM;
/* Prevent other routines from reading partial hpriv data by * initializing hpriv fields before inserting it to the list
*/
hpriv->hdev = hdev;
filp->private_data = hpriv;
if (!hl_ctrl_device_operational(hdev, NULL)) {
dev_dbg_ratelimited(hdev->dev_ctrl, "Can't open %s because it is disabled\n",
dev_name(hdev->dev_ctrl));
rc = -EPERM; goto out_err;
}
staticvoid fixup_device_params_per_asic(struct hl_device *hdev, int timeout)
{ switch (hdev->asic_type) { case ASIC_GAUDI: case ASIC_GAUDI_SEC: /* If user didn't request a different timeout than the default one, we have * a different default timeout for Gaudi
*/ if (timeout == HL_DEFAULT_TIMEOUT_LOCKED)
hdev->timeout_jiffies = secs_to_jiffies(GAUDI_DEFAULT_TIMEOUT_LOCKED);
hdev->reset_upon_device_release = 0; break;
case ASIC_GOYA:
hdev->reset_upon_device_release = 0; break;
/* Enable only after the initialization of the device */
hdev->disabled = true;
if (!(hdev->fw_components & FW_TYPE_PREBOOT_CPU) &&
(hdev->fw_components & ~FW_TYPE_PREBOOT_CPU)) {
pr_err("Preboot must be set along with other components"); return -EINVAL;
}
/* If CPU queues not enabled, no way to do heartbeat */ if (!hdev->cpu_queues_enable)
hdev->heartbeat = 0;
fixup_device_params_per_asic(hdev, tmp_timeout);
return 0;
}
/*
 * allocate_device_id - reserve an ID for the device in the global devices IDR
 *
 * @hdev: pointer to habanalabs device structure.
 *
 * Allocates an ID from hl_devs_idr (bounds 0 and HL_MAX_MINORS) while holding
 * hl_devs_idr_lock, and stores it in both hdev->id and hdev->cdev_idx.
 *
 * Return: 0 on success. Every allocation failure is reported to the caller
 * as -EBUSY; only the -ENOSPC case is additionally logged.
 */
static int allocate_device_id(struct hl_device *hdev)
{
	int id;

	mutex_lock(&hl_devs_idr_lock);
	id = idr_alloc(&hl_devs_idr, hdev, 0, HL_MAX_MINORS, GFP_KERNEL);
	mutex_unlock(&hl_devs_idr_lock);

	if (id < 0) {
		if (id == -ENOSPC)
			pr_err("too many devices in the system\n");
		return -EBUSY;
	}

	hdev->id = id;

	/*
	 * Firstly initialized with the internal device ID.
	 * Will be updated later after the DRM device registration to hold the minor ID.
	 */
	hdev->cdev_idx = hdev->id;

	return 0;
}
/**
 * create_hdev - create habanalabs device instance
 *
 * @dev: will hold the pointer to the new habanalabs device structure
 * @pdev: pointer to the pci device
 *
 * Allocate memory for habanalabs device and initialize basic fields
 * Identify the ASIC type
 * Allocate ID (minor) for the device (only for real devices)
 *
 * Return: 0 on success, in which case *dev is set to the new device.
 * -ENODEV when get_asic_type() yields ASIC_INVALID, or the error returned
 * by allocate_device_id().
 *
 * NOTE(review): no statement visible in this body assigns hdev before it is
 * dereferenced; the allocation step mentioned above appears to be missing
 * from this copy of the file. Confirm against the original source.
 * NOTE(review): the result of fixup_device_params() is not checked here;
 * confirm whether that call can fail.
 * NOTE(review): `staticint` looks like a fused `static int`.
 */ staticint create_hdev(struct hl_device **dev, struct pci_dev *pdev)
{ struct hl_device *hdev; int rc;
/* Will be NULL in case of simulator device */
hdev->pdev = pdev;
/* Assign status description string (one human-readable label per device status) */
strscpy(hdev->status[HL_DEVICE_STATUS_OPERATIONAL], "operational", HL_STR_MAX);
strscpy(hdev->status[HL_DEVICE_STATUS_IN_RESET], "in reset", HL_STR_MAX);
strscpy(hdev->status[HL_DEVICE_STATUS_MALFUNCTION], "disabled", HL_STR_MAX);
strscpy(hdev->status[HL_DEVICE_STATUS_NEEDS_RESET], "needs reset", HL_STR_MAX);
strscpy(hdev->status[HL_DEVICE_STATUS_IN_DEVICE_CREATION], "in device creation", HL_STR_MAX);
strscpy(hdev->status[HL_DEVICE_STATUS_IN_RESET_AFTER_DEVICE_RELEASE], "in reset after device release", HL_STR_MAX);
/* First, we must find out which ASIC are we handling. This is needed
 * to configure the behavior of the driver (kernel parameters)
 */
hdev->asic_type = get_asic_type(hdev); if (hdev->asic_type == ASIC_INVALID) {
dev_err(&pdev->dev, "Unsupported ASIC\n");
rc = -ENODEV; goto out_err;
}
/* Apply module parameters and per-device defaults before reserving an ID */
copy_kernel_module_params_to_device(hdev);
set_driver_behavior_per_device(hdev);
fixup_device_params(hdev);
rc = allocate_device_id(hdev); if (rc) goto out_err;
/* Publish the device to the caller only once every step has succeeded */
*dev = hdev;
return 0;
out_err: return rc;
}
/*
 * destroy_hdev - destroy habanalabs device instance
 *
 * @hdev: pointer to the habanalabs device structure
 *
 * Removes hdev->id from hl_devs_idr while holding hl_devs_idr_lock.
 *
 * NOTE(review): after the IDR removal, the body below tests hdev for NULL
 * (after it has already been dereferenced), logs a message that mentions
 * resume, and returns values although the function appears to be void.
 * This tail looks like it belongs to a separate resume routine whose header
 * is missing from this copy of the file. Confirm against the original source.
 * NOTE(review): `staticvoid` looks like a fused `static void`.
 */ staticvoid destroy_hdev(struct hl_device *hdev)
{ /* Remove device from the device list */
mutex_lock(&hl_devs_idr_lock);
idr_remove(&hl_devs_idr, hdev->id);
mutex_unlock(&hl_devs_idr_lock);
/* NOTE(review): this check comes too late to protect the hdev->id access above */
if (!hdev) {
pr_err("device pointer is NULL in resume\n"); return 0;
}
return hl_device_resume(hdev);
}
/** * hl_pci_probe - probe PCI habanalabs devices * * @pdev: pointer to pci device * @id: pointer to pci device id structure * * Standard PCI probe function for habanalabs device. * Create a new habanalabs device and initialize it according to the * device's type
*/ staticint hl_pci_probe(struct pci_dev *pdev, conststruct pci_device_id *id)
{ struct hl_device *hdev; int rc;
/** * hl_pci_err_detected - a PCI bus error detected on this device * * @pdev: pointer to pci device * @state: PCI error type * * Called by the PCI subsystem whenever a non-correctable * PCI bus error is detected
*/ static pci_ers_result_t
hl_pci_err_detected(struct pci_dev *pdev, pci_channel_state_t state)
{ struct hl_device *hdev = pci_get_drvdata(pdev); enum pci_ers_result result;
switch (state) { case pci_channel_io_normal:
dev_warn(hdev->dev, "PCI normal state error detected\n"); return PCI_ERS_RESULT_CAN_RECOVER;
case pci_channel_io_frozen:
dev_warn(hdev->dev, "PCI frozen state error detected\n");
result = PCI_ERS_RESULT_NEED_RESET; break;
case pci_channel_io_perm_failure:
dev_warn(hdev->dev, "PCI failure state error detected\n");
result = PCI_ERS_RESULT_DISCONNECT; break;
/**
 * hl_pci_err_resume - resume after a PCI slot reset
 *
 * @pdev: pointer to pci device
 *
 * Looks up the habanalabs device stored as the PCI device's driver data,
 * logs a warning and calls hl_device_resume() on it. The result of the
 * resume call is not propagated, as this handler returns nothing.
 */
static void hl_pci_err_resume(struct pci_dev *pdev)
{
	struct hl_device *hdev = pci_get_drvdata(pdev);

	dev_warn(hdev->dev, "Resuming device after PCI slot reset\n");
	hl_device_resume(hdev);
}
/** * hl_pci_err_slot_reset - a PCI slot reset has just happened * * @pdev: pointer to pci device * * Determine if the driver can recover from the PCI slot reset
*/ static pci_ers_result_t hl_pci_err_slot_reset(struct pci_dev *pdev)
{ struct hl_device *hdev = pci_get_drvdata(pdev);
/* * Schedule a thread to trigger hard reset. * The reason for this handler, is for rare cases where the driver is up * and FLR occurs. This is valid only when working with no VM, so FW handles FLR * and resets the device. FW will go back preboot stage, so driver needs to perform * hard reset in order to load FW fit again.
*/
flags = HL_DRV_RESET_HARD | HL_DRV_RESET_BYPASS_REQ_TO_FW;
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit noch Richtigkeit
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.