/**
 * DOC: Xe device coredump
 *
 * Xe uses dev_coredump infrastructure for exposing the crash errors in a
 * standardized way. Once a crash occurs, devcoredump exposes a temporary
 * node under ``/sys/class/devcoredump/devcd<m>/``. The same node is also
 * accessible in ``/sys/class/drm/card<n>/device/devcoredump/``. The
 * ``failing_device`` symlink points to the device that crashed and created the
 * coredump.
 *
 * The following characteristics are observed by xe when creating a device
 * coredump:
 *
 * **Snapshot at hang**:
 *   The 'data' file contains a snapshot of the HW and driver states at the time
 *   the hang happened. Due to the driver recovering from resets/crashes, it may
 *   not correspond to the state of the system when the file is read by
 *   userspace.
 *
 * **Coredump release**:
 *   After a coredump is generated, it stays in kernel memory until released by
 *   userspace by writing anything to it, or after an internal timer expires. The
 *   exact timeout may vary and should not be relied upon. Example to release
 *   a coredump:
 *
 *   .. code-block:: shell
 *
 *      $ > /sys/class/drm/card0/device/devcoredump/data
 *
 * **First failure only**:
 *   In general, the first hang is the most critical one since the following
 *   hangs can be a consequence of the initial hang. For this reason a snapshot
 *   is taken only for the first failure. Until the devcoredump is released by
 *   userspace or kernel, all subsequent hangs do not override the snapshot nor
 *   create new ones. Devcoredump has a delayed work queue that will eventually
 *   delete the file node and free all the dump information.
 */
/**
 * xe_devcoredump_read() - Read data from the Xe device coredump snapshot
 * @buffer: Destination buffer to copy the coredump data into
 * @offset: Offset in the coredump data to start reading from
 * @count: Number of bytes to read
 * @data: Pointer to the xe_devcoredump structure
 * @datalen: Length of the data (unused)
 *
 * Reads a chunk of the coredump snapshot data into the provided buffer.
 * If the devcoredump is smaller than 1.5 GB (XE_DEVCOREDUMP_CHUNK_MAX),
 * it is read directly from a pre-written buffer. For larger devcoredumps,
 * the pre-written buffer must be periodically repopulated from the snapshot
 * state due to kmalloc size limitations.
 *
 * Return: Number of bytes copied on success, or a negative error code on failure.
*/ static ssize_t xe_devcoredump_read(char *buffer, loff_t offset,
size_t count, void *data, size_t datalen)
{ struct xe_devcoredump *coredump = data; struct xe_devcoredump_snapshot *ss;
ssize_t byte_copied = 0;
u32 chunk_offset;
ssize_t new_chunk_position; bool pm_needed = false; int ret = 0;
if (!coredump) return -ENODEV;
ss = &coredump->snapshot;
/* Ensure delayed work is captured before continuing */
flush_work(&ss->work);
pm_needed = ss->read.size > XE_DEVCOREDUMP_CHUNK_MAX; if (pm_needed)
xe_pm_runtime_get(gt_to_xe(ss->gt));
mutex_lock(&coredump->lock);
if (!ss->read.buffer) {
ret = -ENODEV; goto unlock;
}
/* To prevent stale data on next snapshot, clear everything */
memset(&coredump->snapshot, 0, sizeof(coredump->snapshot));
coredump->captured = false;
drm_info(&coredump_to_xe(coredump)->drm, "Xe device coredump has been deleted.\n");
/*
 * NB: Despite passing a GFP_ flags parameter here, more allocations are done
 * internally using GFP_KERNEL explicitly. Hence this call must be in the worker
 * thread and not in the initial capture call.
 */
dev_coredumpm_timeout(gt_to_xe(ss->gt)->drm.dev, THIS_MODULE, coredump, 0, GFP_KERNEL,
xe_devcoredump_read, xe_devcoredump_free,
XE_COREDUMP_TIMEOUT_JIFFIES);
xe_pm_runtime_get(xe);
/* keep going if fw fails as we still want to save the memory and SW data */
fw_ref = xe_force_wake_get(gt_to_fw(ss->gt), XE_FORCEWAKE_ALL); if (!xe_force_wake_ref_has_domain(fw_ref, XE_FORCEWAKE_ALL))
xe_gt_info(ss->gt, "failed to get forcewake for coredump capture\n");
xe_vm_snapshot_capture_delayed(ss->vm);
xe_guc_exec_queue_snapshot_capture_delayed(ss->ge);
xe_force_wake_put(gt_to_fw(ss->gt), fw_ref);
/**
 * xe_devcoredump() - Take the required snapshots and initialize coredump device.
 * @q: The faulty xe_exec_queue, where the issue was detected.
 * @job: The faulty xe_sched_job, where the issue was detected.
 * @fmt: Printf format + args to describe the reason for the core dump
 *
 * This function should be called at the crash time within the serialized
 * gt_reset. It is skipped if we still have the core dump device available
 * with the information of the 'first' snapshot.
 */
__printf(3, 4) void xe_devcoredump(struct xe_exec_queue *q, struct xe_sched_job *job, constchar *fmt, ...)
{ struct xe_device *xe = gt_to_xe(q->gt); struct xe_devcoredump *coredump = &xe->devcoredump;
va_list varg;
mutex_lock(&coredump->lock);
if (coredump->captured) {
drm_dbg(&xe->drm, "Multiple hangs are occurring, but only the first snapshot was taken\n");
mutex_unlock(&coredump->lock); return;
}
drm_info(&xe->drm, "Xe device coredump has been created\n");
drm_info(&xe->drm, "Check your /sys/class/drm/card%d/device/devcoredump/data\n",
xe->drm.primary->index);
/**
 * xe_print_blob_ascii85() - print a BLOB to some useful location in ASCII85
 *
 * The output is split into multiple calls to drm_puts() because some print
 * targets, e.g. dmesg, cannot handle arbitrarily long lines. These targets may
 * add newlines, as is the case with dmesg: each drm_puts() call creates a
 * separate line.
 *
 * There is also a scheduler yield call to prevent the 'task has been stuck for
 * 120s' kernel hang check feature from firing when printing to a slow target
 * such as dmesg over a serial port.
 *
 * @p: the printer object to output to
 * @prefix: optional prefix to add to output string
 * @suffix: optional suffix to add at the end. 0 disables it and is
 *          not added to the output, which is useful when using multiple calls
 *          to dump data to @p
 * @blob: the Binary Large OBject to dump out
 * @offset: offset in bytes to skip from the front of the BLOB, must be a multiple of sizeof(u32)
 * @size: the size in bytes of the BLOB, must be a multiple of sizeof(u32)
*/ void xe_print_blob_ascii85(struct drm_printer *p, constchar *prefix, char suffix, constvoid *blob, size_t offset, size_t size)
{ const u32 *blob32 = (const u32 *)blob; char buff[ASCII85_BUFSZ], *line_buff;
size_t line_pos = 0;
#define DMESG_MAX_LINE_LEN 800 /* Always leave space for the suffix char and the \0 */ #define MIN_SPACE (ASCII85_BUFSZ + 2) /* 85 + "<suffix>\0" */
if (size & 3)
drm_printf(p, "Size not word aligned: %zu", size); if (offset & 3)
drm_printf(p, "Offset not word aligned: %zu", offset);
line_buff = kzalloc(DMESG_MAX_LINE_LEN, GFP_ATOMIC); if (!line_buff) {
drm_printf(p, "Failed to allocate line buffer\n"); return;
}
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.