/*
 * While the engine is active, we send a periodic pulse along the engine
 * to check on its health and to flush any idle-barriers. If that request
 * is stuck, and we fail to preempt it, we declare the engine hung and
 * issue a reset -- in the hope that restores progress.
 */
staticbool next_heartbeat(struct intel_engine_cs *engine)
{ struct i915_request *rq; long delay;
/* * FIXME: The final period extension is disabled if the period has been * modified from the default. This is to prevent issues with certain * selftests which override the value and expect specific behaviour. * Once the selftests have been updated to either cope with variable * heartbeat periods (or to override the pre-emption timeout as well, * or just to add a selftest specific override of the extension), the * generic override can be removed.
*/ if (rq && rq->sched.attr.priority >= I915_PRIORITY_BARRIER &&
delay == engine->defaults.heartbeat_interval_ms) { long longer;
/* * The final try is at the highest priority possible. Up until now * a pre-emption might not even have been attempted. So make sure * this last attempt allows enough time for a pre-emption to occur.
*/
longer = READ_ONCE(engine->props.preempt_timeout_ms) * 2;
longer = intel_clamp_heartbeat_interval_ms(engine, longer); if (longer > delay)
delay = longer;
}
if (intel_engine_uses_guc(engine)) /* * GuC itself is toast or GuC's hang detection * is disabled. Either way, need to find the * hang culprit manually.
*/
intel_guc_find_hung_context(engine);
intel_gt_handle_error(engine->gt, engine->mask,
I915_ERROR_CAPTURE, "stopped heartbeat on %s",
engine->name);
}
if (!intel_engine_pm_get_if_awake(engine)) return;
if (intel_gt_is_wedged(engine->gt)) goto out;
if (i915_sched_engine_disabled(engine->sched_engine)) {
reset_engine(engine, engine->heartbeat.systole); goto out;
}
if (engine->heartbeat.systole) { long delay = READ_ONCE(engine->props.heartbeat_interval_ms);
/* Safeguard against too-fast worker invocations */ if (!time_after(jiffies,
rq->emitted_jiffies + msecs_to_jiffies(delay))) goto out;
if (!i915_sw_fence_signaled(&rq->submit)) { /* * Not yet submitted, system is stalled. * * This more often happens for ring submission, * where all contexts are funnelled into a common * ringbuffer. If one context is blocked on an * external fence, not only is it not submitted, * but all other contexts, including the kernel * context are stuck waiting for the signal.
*/
} elseif (engine->sched_engine->schedule &&
rq->sched.attr.priority < I915_PRIORITY_BARRIER) { /* * Gradually raise the priority of the heartbeat to * give high priority work [which presumably desires * low latency and no jitter] the chance to naturally * complete before being preempted.
*/
attr.priority = I915_PRIORITY_NORMAL; if (rq->sched.attr.priority >= attr.priority)
attr.priority = I915_PRIORITY_HEARTBEAT; if (rq->sched.attr.priority >= attr.priority)
attr.priority = I915_PRIORITY_BARRIER;
serial = READ_ONCE(engine->serial); if (engine->wakeref_serial == serial) goto out;
if (!mutex_trylock(&ce->timeline->mutex)) { /* Unable to lock the kernel timeline, is the engine stuck? */ if (xchg(&engine->heartbeat.blocked, serial) == serial)
intel_gt_handle_error(engine->gt, engine->mask,
I915_ERROR_CAPTURE, "no heartbeat on %s",
engine->name); goto out;
}
rq = heartbeat_create(ce, GFP_NOWAIT | __GFP_NOWARN); if (IS_ERR(rq)) goto unlock;
heartbeat_commit(rq, &attr);
unlock:
mutex_unlock(&ce->timeline->mutex);
out: if (!engine->i915->params.enable_hangcheck || !next_heartbeat(engine))
i915_request_put(fetch_and_zero(&engine->heartbeat.systole));
intel_engine_pm_put(engine);
}
/*
 * Kick off the heartbeat for @engine.
 *
 * Calls next_heartbeat() for the engine unless
 * CONFIG_DRM_I915_HEARTBEAT_INTERVAL evaluates to zero, in which case
 * nothing is done. The helper's return value is not used here.
 */
void intel_engine_unpark_heartbeat(struct intel_engine_cs *engine)
{
	if (CONFIG_DRM_I915_HEARTBEAT_INTERVAL)
		next_heartbeat(engine);
}
/*
 * Stop the heartbeat for @engine.
 *
 * Attempts to cancel the delayed heartbeat work. Only when
 * cancel_delayed_work() reports that it cancelled something is the
 * request stored in engine->heartbeat.systole taken out of its slot
 * (via fetch_and_zero()) and handed to i915_request_put().
 */
void intel_engine_park_heartbeat(struct intel_engine_cs *engine)
{
	struct i915_request *systole;

	/* Nothing was cancelled: leave the systole slot untouched. */
	if (!cancel_delayed_work(&engine->heartbeat.work))
		return;

	systole = fetch_and_zero(&engine->heartbeat.systole);
	i915_request_put(systole);
}
old = xchg(&engine->props.heartbeat_interval_ms, delay); if (delay)
intel_engine_unpark_heartbeat(engine); else
intel_engine_park_heartbeat(engine);
return old;
}
int intel_engine_set_heartbeat(struct intel_engine_cs *engine, unsignedlong delay)
{ struct intel_context *ce = engine->kernel_context; int err = 0;
if (!delay && !intel_engine_has_preempt_reset(engine)) return -ENODEV;
/* FIXME: Remove together with equally marked hack in next_heartbeat. */ if (delay != engine->defaults.heartbeat_interval_ms &&
delay < 2 * engine->props.preempt_timeout_ms) { if (intel_engine_uses_guc(engine))
drm_notice(&engine->i915->drm, "%s heartbeat interval adjusted to a non-default value which may downgrade individual engine resets to full GPU resets!\n",
engine->name); else
drm_notice(&engine->i915->drm, "%s heartbeat interval adjusted to a non-default value which may cause engine resets to target innocent contexts!\n",
engine->name);
}
intel_engine_pm_get(engine);
err = mutex_lock_interruptible(&ce->timeline->mutex); if (err) goto out_rpm;
if (delay != engine->props.heartbeat_interval_ms) { unsignedlong saved = set_heartbeat(engine, delay);
/* recheck current execution */ if (intel_engine_has_preemption(engine)) {
err = __intel_engine_pulse(engine); if (err)
set_heartbeat(engine, saved);
}
}
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.