	if (dc_flush_wa)
		cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_CS_STALL, 0);

	intel_ring_advance(rq, cs);

	return 0;
}
int gen8_emit_flush_xcs(struct i915_request *rq, u32 mode)
{
	u32 cmd, *cs;

	cs = intel_ring_begin(rq, 4);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	cmd = MI_FLUSH_DW + 1;

	/*
	 * We always require a command barrier so that subsequent
	 * commands, such as breadcrumb interrupts, are strictly ordered
	 * wrt the contents of the write cache being flushed to memory
	 * (and thus being coherent from the CPU).
	 */
	cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;

	if (mode & EMIT_INVALIDATE) {
		cmd |= MI_INVALIDATE_TLB;
		if (rq->engine->class == VIDEO_DECODE_CLASS)
			cmd |= MI_INVALIDATE_BSD;
	}
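	/*
	 * Sketch of how such an engine flush is typically completed; the
	 * exact dwords below are illustrative rather than quoted from the
	 * elided remainder of this function. The assembled command is
	 * written out as a single MI_FLUSH_DW whose post-sync write lands
	 * in the per-context scratch page, which is what gives later
	 * commands (e.g. the breadcrumb) their ordering guarantee:
	 *
	 *	*cs++ = cmd;
	 *	*cs++ = LRC_PPHWSP_SCRATCH_ADDR;
	 *	*cs++ = 0;	(upper address dword)
	 *	*cs++ = 0;	(post-sync immediate value)
	 *	intel_ring_advance(rq, cs);
	 *
	 *	return 0;
	 */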
static bool gen12_needs_ccs_aux_inv(struct intel_engine_cs *engine)
{
	i915_reg_t reg = gen12_get_aux_inv_reg(engine);

	/*
	 * So far platforms supported by i915 having flat ccs do not require
	 * AUX invalidation. Check also whether the engine requires it.
	 */
	return i915_mmio_reg_valid(reg) && !HAS_FLAT_CCS(engine->i915);
}
int gen12_emit_flush_rcs(struct i915_request *rq, u32 mode)
{
	struct intel_engine_cs *engine = rq->engine;

	/*
	 * On Aux CCS platforms the invalidation of the Aux
	 * table requires quiescing memory traffic beforehand
	 */
	if (mode & EMIT_FLUSH || gen12_needs_ccs_aux_inv(engine)) {
		u32 bit_group_0 = 0;
		u32 bit_group_1 = 0;
		int err;
		u32 *cs;

		err = mtl_dummy_pipe_control(rq);
		if (err)
			return err;

		bit_group_0 |= PIPE_CONTROL0_HDC_PIPELINE_FLUSH;

		/*
		 * When required, in MTL and beyond platforms we
		 * need to set the CCS_FLUSH bit in the pipe control
		 */
		if (GRAPHICS_VER_FULL(rq->i915) >= IP_VER(12, 70))
			bit_group_0 |= PIPE_CONTROL_CCS_FLUSH;

		/*
		 * L3 fabric flush is needed for AUX CCS invalidation
		 * which happens as part of pipe-control so we can
		 * ignore PIPE_CONTROL_FLUSH_L3. Also PIPE_CONTROL_FLUSH_L3
		 * deals with Protected Memory which is not needed for
		 * AUX CCS invalidation and leads to unwanted side effects.
		 */
		if ((mode & EMIT_FLUSH) &&
		    GRAPHICS_VER_FULL(rq->i915) < IP_VER(12, 70))
			bit_group_1 |= PIPE_CONTROL_FLUSH_L3;
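		/*
		 * Illustrative sketch (not quoted from the elided remainder
		 * of this block): once the remaining flush bits have been
		 * accumulated into bit_group_1, the block typically reserves
		 * ring space and emits a single pipe control built from both
		 * bit groups, targeting the per-context scratch page, along
		 * the lines of:
		 *
		 *	cs = intel_ring_begin(rq, 6);
		 *	if (IS_ERR(cs))
		 *		return PTR_ERR(cs);
		 *
		 *	cs = gen12_emit_pipe_control(cs, bit_group_0, bit_group_1,
		 *				     LRC_PPHWSP_SCRATCH_ADDR);
		 *	intel_ring_advance(rq, cs);
		 */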
		count = 8;
		if (gen12_needs_ccs_aux_inv(rq->engine))
			count += 8;

		cs = intel_ring_begin(rq, count);
		if (IS_ERR(cs))
			return PTR_ERR(cs);

		/*
		 * Prevent the pre-parser from skipping past the TLB
		 * invalidate and loading a stale page for the batch
		 * buffer / request payload.
		 */
		*cs++ = preparser_disable(true);
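		/*
		 * Sketch of how an invalidate block like this usually
		 * finishes (illustrative, not quoted from the elided code;
		 * "flags" stands for the accumulated invalidate bits): the
		 * invalidation pipe control is emitted, the AUX table is
		 * invalidated on platforms that need it, and only then is
		 * the pre-parser allowed to run ahead again:
		 *
		 *	cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
		 *
		 *	if (gen12_needs_ccs_aux_inv(rq->engine))
		 *		cs = gen12_emit_aux_table_inv(rq->engine, cs);
		 *
		 *	*cs++ = preparser_disable(false);
		 *	intel_ring_advance(rq, cs);
		 */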
int gen12_emit_flush_xcs(struct i915_request *rq, u32 mode)
{
	u32 cmd = 4;
	u32 *cs;

	if (mode & EMIT_INVALIDATE) {
		cmd += 2;

		if (gen12_needs_ccs_aux_inv(rq->engine))
			cmd += 8;
	}

	cs = intel_ring_begin(rq, cmd);
	if (IS_ERR(cs))
		return PTR_ERR(cs);
	if (mode & EMIT_INVALIDATE)
		*cs++ = preparser_disable(true);

	cmd = MI_FLUSH_DW + 1;

	/*
	 * We always require a command barrier so that subsequent
	 * commands, such as breadcrumb interrupts, are strictly ordered
	 * wrt the contents of the write cache being flushed to memory
	 * (and thus being coherent from the CPU).
	 */
	cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;

	if (mode & EMIT_INVALIDATE) {
		cmd |= MI_INVALIDATE_TLB;
		if (rq->engine->class == VIDEO_DECODE_CLASS)
			cmd |= MI_INVALIDATE_BSD;

		if (gen12_needs_ccs_aux_inv(rq->engine) &&
		    rq->engine->class == COPY_ENGINE_CLASS)
			cmd |= MI_FLUSH_DW_CCS;
	}
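	/*
	 * Illustrative sketch of how this gen12 xcs flush typically winds up
	 * (not quoted from the elided remainder): the MI_FLUSH_DW dwords are
	 * written out, the AUX table invalidation is emitted for engines
	 * that need it, and the pre-parser is re-enabled before the ring is
	 * advanced, roughly:
	 *
	 *	*cs++ = cmd;
	 *	*cs++ = LRC_PPHWSP_SCRATCH_ADDR;
	 *	*cs++ = 0;
	 *	*cs++ = 0;
	 *
	 *	if (gen12_needs_ccs_aux_inv(rq->engine))
	 *		cs = gen12_emit_aux_table_inv(rq->engine, cs);
	 *
	 *	if (mode & EMIT_INVALIDATE)
	 *		*cs++ = preparser_disable(false);
	 *
	 *	intel_ring_advance(rq, cs);
	 */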
	/*
	 * Check if we have been preempted before we even get started.
	 *
	 * After this point i915_request_started() reports true, even if
	 * we get preempted and so are no longer running.
	 *
	 * i915_request_started() is used during preemption processing
	 * to decide if the request is currently inside the user payload
	 * or spinning on a kernel semaphore (or earlier). For no-preemption
	 * requests, we do allow preemption on the semaphore before the user
	 * payload, but do not allow preemption once the request is started.
	 *
	 * i915_request_started() is similarly used during GPU hangs to
	 * determine if the user's payload was guilty, and if so, the
	 * request is banned. Before the request is started, it is assumed
	 * to be unharmed and an innocent victim of another's hang.
	 */
	*cs++ = MI_NOOP;
	*cs++ = MI_ARB_CHECK;

	intel_ring_advance(rq, cs);

	/* Record the updated position of the request's payload */
	rq->infix = intel_ring_offset(rq, cs);
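	/*
	 * Context for the fragment below (an assumption based on the DG2
	 * workaround batch layout, not stated in the surrounding code):
	 * wa_offset is taken to be the start of the per-context indirect
	 * workaround batch, so branching to wa_offset + DG2_PREDICATE_RESULT_BB
	 * runs the small canned sequence that clears a stray MI_SET_PREDICATE
	 * before the ring continues executing.
	 */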
	/* Fixup stray MI_SET_PREDICATE as it prevents us executing the ring */
	*cs++ = MI_BATCH_BUFFER_START_GEN8;
	*cs++ = wa_offset + DG2_PREDICATE_RESULT_BB;
	*cs++ = 0;
	cs = intel_ring_begin(rq, 4);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	/*
	 * WaDisableCtxRestoreArbitration:bdw,chv
	 *
	 * We don't need to perform MI_ARB_ENABLE as often as we do (in
	 * particular all the gen that do not need the w/a at all!), if we
	 * took care to make sure that on every switch into this context
	 * (both ordinary and for preemption) that arbitration was enabled
	 * we would be fine. However, for gen8 there is another w/a that
	 * requires us to not preempt inside GPGPU execution, so we keep
	 * arbitration disabled for gen8 batches. Arbitration will be
	 * re-enabled before we close the request
	 * (engine->emit_fini_breadcrumb).
	 */
	*cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
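	/*
	 * Sketch of the batch-buffer start that normally follows the
	 * arbitration disable in a gen8 bb_start emitter (illustrative and
	 * simplified; flags and offset stand for the emitter's arguments):
	 *
	 *	*cs++ = MI_BATCH_BUFFER_START_GEN8 |
	 *		(flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
	 *	*cs++ = lower_32_bits(offset);
	 *	*cs++ = upper_32_bits(offset);
	 *
	 *	intel_ring_advance(rq, cs);
	 */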
static void assert_request_valid(struct i915_request *rq)
{
	struct intel_ring *ring __maybe_unused = rq->ring;

	/* Can we unwind this request without appearing to go forwards? */
	GEM_BUG_ON(intel_ring_direction(ring, rq->wa_tail, rq->head) <= 0);
}
/*
 * Reserve space for 2 NOOPs at the end of each request to be
 * used as a workaround for not being allowed to do lite
 * restore with HEAD==TAIL (WaIdleLiteRestore).
 */
static u32 *gen8_emit_wa_tail(struct i915_request *rq, u32 *cs)
{
	/* Ensure there's always at least one preemption point per-request. */
	*cs++ = MI_ARB_CHECK;
	*cs++ = MI_NOOP;
	rq->wa_tail = intel_ring_offset(rq, cs);

	/* Check that entire request is less than half the ring */
	assert_request_valid(rq);

	return cs;
}
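/*
 * Sketch of how this helper is consumed by a breadcrumb emitter
 * (illustrative and simplified; the actual fini-breadcrumb sequences in
 * this file carry additional steps such as the preempt busywait):
 *
 *	static u32 *emit_fini_breadcrumb_tail(struct i915_request *rq, u32 *cs)
 *	{
 *		*cs++ = MI_USER_INTERRUPT;
 *		*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
 *
 *		rq->tail = intel_ring_offset(rq, cs);
 *		assert_ring_tail_valid(rq->ring, rq->tail);
 *
 *		return gen8_emit_wa_tail(rq, cs);
 *	}
 */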
/*
 * Note that the CS instruction pre-parser will not stall on the breadcrumb
 * flush and will continue pre-fetching the instructions after it before the
 * memory sync is completed. On pre-gen12 HW, the pre-parser will stop at
 * BB_START/END instructions, so, even though we might pre-fetch the pre-amble
 * of the next request before the memory has been flushed, we're guaranteed that
 * we won't access the batch itself too early.
 * However, on gen12+ the parser can pre-fetch across the BB_START/END commands,
 * so, if the current request is modifying an instruction in the next request on
 * the same intel_context, we might pre-fetch and then execute the pre-update
 * instruction. To avoid this, the users of self-modifying code should either
 * disable the parser around the code emitting the memory writes, via a new flag
 * added to MI_ARB_CHECK, or emit the writes from a different intel_context. For
 * the in-kernel use-cases we've opted to use a separate context, see
 * reloc_gpu() as an example.
 * All the above applies only to the instructions themselves. Non-inline data
 * used by the instructions is not pre-fetched.
 */
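/*
 * Sketch of the "disable the parser around the writes" option mentioned
 * above (illustrative; it follows the preparser_disable() pattern used
 * earlier in this file rather than quoting an existing user):
 *
 *	*cs++ = preparser_disable(true);
 *	... emit the stores that patch instructions in the next request ...
 *	*cs++ = preparser_disable(false);
 *
 * The in-kernel users instead emit such writes from a separate
 * intel_context, see reloc_gpu().
 */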
/* * When MI_ATOMIC_INLINE_DATA set this command must be 11 DW + (1 NOP) * to align. 4 DWs above + 8 filler DWs here.
*/ for (i = 0; i < 8; ++i)
*cs++ = 0;
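	/*
	 * Alignment arithmetic for the filler above: the 4 dwords emitted
	 * before this loop plus the 8 zero dwords here give 12 dwords in
	 * total, i.e. the 11-DW MI_ATOMIC command plus the single NOP of
	 * padding that the inline-data form requires.
	 */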