/* * The per-platform tables are u8-encoded in @data. Decode @data and set the * addresses' offset and commands in @regs. The following encoding is used * for each byte. There are 2 steps: decoding commands and decoding addresses. * * Commands: * [7]: create NOPs - number of NOPs are set in lower bits * [6]: When creating MI_LOAD_REGISTER_IMM command, allow to set * MI_LRI_FORCE_POSTED * [5:0]: Number of NOPs or registers to set values to in case of * MI_LOAD_REGISTER_IMM * * Addresses: these are decoded after a MI_LOAD_REGISTER_IMM command by "count" * number of registers. They are set by using the REG/REG16 macros: the former * is used for offsets smaller than 0x200 while the latter is for values bigger * than that. Those macros already set all the bits documented below correctly: * * [7]: When a register offset needs more than 6 bits, use additional bytes, to * follow, for the lower bits * [6:0]: Register offset, without considering the engine base. * * This function only tweaks the commands and register offsets. Values are not * filled out.
*/ staticvoid set_offsets(u32 *regs, const u8 *data, conststruct intel_engine_cs *engine, bool close) #define NOP(x) (BIT(7) | (x)) #define LRI(count, flags) ((flags) << 6 | (count) | BUILD_BUG_ON_ZERO(count >= BIT(6))) #define POSTED BIT(0) #define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200)) #define REG16(x) \
(((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \
(((x) >> 2) & 0x7f) #define END 0
/*
 * NOTE(review): this chunk appears whitespace-mangled and truncated — the
 * outer loop that walks *data and decodes the command byte (defining the
 * `count` and `flags` used below, plus the NOP handling described in the
 * header comment) is not visible here. Confirm against the unmangled
 * source before editing.
 */
{ const u32 base = engine->mmio_base;
/* Emit the LRI header; posted/CS_MMIO flags depend on flags and platform. */
*regs = MI_LOAD_REGISTER_IMM(count); if (flags & POSTED)
*regs |= MI_LRI_FORCE_POSTED; if (GRAPHICS_VER(engine->i915) >= 11)
*regs |= MI_LRI_LRM_CS_MMIO;
regs++;
/* Decode one register offset per LRI slot, 7 bits per byte, MSB first. */
GEM_BUG_ON(!count); do {
u32 offset = 0;
u8 v;
do {
v = *data++;
offset <<= 7;
offset |= v & ~BIT(7);
} while (v & BIT(7));
/* Offsets are encoded in dword units; the value dword is left unfilled. */
regs[0] = base + (offset << 2);
regs += 2;
} while (--count);
}
if (close) { /* Close the batch; used mainly by live_lrc_layout() */
*regs = MI_BATCH_BUFFER_END; if (GRAPHICS_VER(engine->i915) >= 11)
*regs |= BIT(0);
}
}
#undef END #undef REG16 #undef REG #undef LRI #undef NOP
/*
 * reg_offsets - select the per-platform register-offset table for @engine.
 *
 * NOTE(review): the body below looks corrupted by extraction — from the
 * Wa_14019159160 comment onward the code references `ce`, `gem_ctx` and
 * `ctx_is_protected`, none of which are in scope in this function; that
 * fragment most likely belongs to a different function. Verify against
 * the original file before relying on this text.
 */
staticconst u8 *reg_offsets(conststruct intel_engine_cs *engine)
{ /* * The gen12+ lists only have the registers we program in the basic * default state. We rely on the context image using relative * addressing to automatic fixup the register state between the * physical engines for virtual engine.
*/
GEM_BUG_ON(GRAPHICS_VER(engine->i915) >= 12 &&
!intel_engine_has_relative_mmio(engine));
/* * Wa_14019159160 - Case 2. * On some platforms, protected contexts require setting * the LRC run-alone bit or else the encryption/decryption will not happen. * NOTE: Case 2 only applies to PXP use-case of said workaround.
*/ if (GRAPHICS_VER_FULL(ce->engine->i915) >= IP_VER(12, 70) &&
(ce->engine->class == COMPUTE_CLASS || ce->engine->class == RENDER_CLASS)) {
rcu_read_lock();
/* RCU-protected peek at the GEM context to read its PXP protection flag. */
gem_ctx = rcu_dereference(ce->gem_context); if (gem_ctx)
ctx_is_protected = gem_ctx->uses_protected_content;
rcu_read_unlock();
}
/*
 * __lrc_init_regs - initialize the register state of a context image.
 *
 * NOTE(review): the function body is missing from this chunk; only the
 * signature and its header comment survived extraction.
 */
staticvoid __lrc_init_regs(u32 *regs, conststruct intel_context *ce, conststruct intel_engine_cs *engine, bool inhibit)
{ /* * A context is actually a big batch buffer with several * MI_LOAD_REGISTER_IMM commands followed by (reg, value) pairs. The * values we are setting here are only for the first context restore: * on a subsequent save, the GPU will recreate this batchbuffer with new * values (including all the missing MI_LOAD_REGISTER_IMM commands that * we are not initializing here). * * Must keep consistent with virtual_update_register_offsets().
*/
/* * per_ctx below determines which WABB section is used. * When true, the function returns the location of the * PER_CTX_BB. When false, the function returns the * location of the INDIRECT_CTX.
*/ static u32 *context_wabb(conststruct intel_context *ce, bool per_ctx)
{ void *ptr;
/* A WA batch-buffer page must already be attached to this context. */
GEM_BUG_ON(!ce->wa_bb_page);
/* Walk from the register state back to the start of the context image... */
ptr = ce->lrc_reg_state;
ptr -= LRC_STATE_OFFSET; /* back to start of context image */
/* ...then forward to the WA BB area; PER_CTX_BB sits one page past it. */
ptr += context_wa_bb_offset(ce);
ptr += per_ctx ? PAGE_SIZE : 0;
/* NOTE(review): truncated — `return ptr;` and the closing brace are missing from this chunk. */
/*
 * NOTE(review): orphaned fragment — `state`, `engine` and `inhibit` are not
 * declared in the visible text; this appears to be the tail of a context
 * state-initialization function whose opening was lost in extraction.
 */
/* Clear the indirect wa and storage */ if (ce->wa_bb_page)
memset(state + context_wa_bb_offset(ce), 0, PAGE_SIZE);
/* * The second page of the context object contains some registers which * must be set up prior to the first execution.
*/
__lrc_init_regs(state + LRC_STATE_OFFSET, ce, engine, inhibit);
}
/*
 * NOTE(review): orphaned fragment — `obj` and `context_size` are declared in
 * the (missing) enclosing function. Tries device-local memory (lmem) first
 * and falls back to shmem for the context backing store.
 */
obj = i915_gem_object_create_lmem(engine->i915, context_size,
I915_BO_ALLOC_PM_VOLATILE); if (IS_ERR(obj)) {
obj = i915_gem_object_create_shmem(engine->i915, context_size); if (IS_ERR(obj)) return ERR_CAST(obj);
/* * Wa_22016122933: For Media version 13.0, all Media GT shared * memory needs to be mapped as WC on CPU side and UC (PAT * index 2) on GPU side.
*/ if (intel_gt_needs_wa_22016122933(engine->gt))
i915_gem_object_set_cache_coherency(obj, I915_CACHE_NONE);
}
/*
 * lrc_alloc - allocate the state object, ring and timeline backing a
 * logical ring context.
 *
 * NOTE(review): truncated in this chunk — the tail (timeline pinning, the
 * error-unwind labels err_ring/err_vma and the final return) is not visible
 * below; confirm against the original file.
 */
int lrc_alloc(struct intel_context *ce, struct intel_engine_cs *engine)
{ struct intel_ring *ring; struct i915_vma *vma; int err;
GEM_BUG_ON(ce->state);
/* Inherit the engine's default state unless the context owns its own. */
if (!intel_context_has_own_state(ce))
ce->default_state = engine->default_state;
vma = __lrc_alloc_state(ce, engine); if (IS_ERR(vma)) return PTR_ERR(vma);
ring = intel_engine_create_ring(engine, ce->ring_size); if (IS_ERR(ring)) {
err = PTR_ERR(ring); goto err_vma;
}
if (!page_mask_bits(ce->timeline)) { struct intel_timeline *tl;
/* * Use the static global HWSP for the kernel context, and * a dynamically allocated cacheline for everyone else.
*/ if (unlikely(ce->timeline))
tl = pinned_timeline(ce, engine); else
tl = intel_timeline_create(engine->gt); if (IS_ERR(tl)) {
err = PTR_ERR(tl); goto err_ring;
}
/* * The bspec's tuning guide asks us to program a vertical watermark value of * 0x3FF. However this register is not saved/restored properly by the * hardware, so we're required to apply the desired value via INDIRECT_CTX * batch buffer to ensure the value takes effect properly. All other bits * in this register should remain at 0 (the hardware default).
*/ static u32 *
dg2_emit_draw_watermark_setting(u32 *cs)
{
/* Single-register LRI: program DRAW_WATERMARK's vertical WM field. */
*cs++ = MI_LOAD_REGISTER_IMM(1);
*cs++ = i915_mmio_reg_offset(DRAW_WATERMARK);
*cs++ = REG_FIELD_PREP(VERT_WM_VAL, 0x3FF);
/* NOTE(review): truncated — `return cs;` and the closing brace are missing from this chunk. */
/* * The context descriptor encodes various attributes of a context, * including its GTT address and some flags. Because it's fairly * expensive to calculate, we'll just do it once and cache the result, * which remains valid until the context is unpinned. * * This is what a descriptor looks like, from LSB to MSB:: * * bits 0-11: flags, GEN8_CTX_* (cached in ctx->desc_template) * bits 12-31: LRCA, GTT address of (the HWSP of) this context * bits 32-52: ctx ID, a globally unique tag (highest bit used by GuC) * bits 53-54: mbz, reserved for use by hardware * bits 55-63: group ID, currently unused and set to 0 * * Starting from Gen11, the upper dword of the descriptor has a new format: * * bits 32-36: reserved * bits 37-47: SW context ID * bits 48:53: engine instance * bit 54: mbz, reserved for use by hardware * bits 55-60: SW counter * bits 61-63: engine class * * On Xe_HP, the upper dword of the descriptor has a new format: * * bits 32-37: virtual function number * bit 38: mbz, reserved for use by hardware * bits 39-54: SW context ID * bits 55-57: reserved * bits 58-63: SW counter * * engine info, SW context ID and SW counter need to form a unique number * (Context ID) per lrc.
*/ static u32 lrc_descriptor(conststruct intel_context *ce)
{
u32 desc;
/*
 * NOTE(review): body corrupted — the descriptor computation described in
 * the header comment is missing, and the WARN_ONCE below references
 * `valid` and `when`, which are not declared here; it almost certainly
 * belongs to a different function. Verify against the original source.
 */
WARN_ONCE(!valid, "Invalid lrc state found %s submission\n", when);
}
/* * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after * PIPE_CONTROL instruction. This is required for the flush to happen correctly * but there is a slight complication as this is applied in WA batch where the * values are only initialized once so we cannot take register value at the * beginning and reuse it further; hence we save its value to memory, upload a * constant value with bit21 set and then we restore it back with the saved value. * To simplify the WA, a constant value is formed by using the default value * of this register. This shouldn't be a problem because we are only modifying * it for a short period and this batch in non-premptible. We can ofcourse * use additional instructions that read the actual value of the register * at that time and set our bit of interest but it makes the WA complicated. * * This WA is also required for Gen9 so extracting as a function avoids * code duplication.
*/ static u32 *
gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, u32 *batch)
{ /* NB no one else is allowed to scribble over scratch + 256! */
/* Save the current GEN8_L3SQCREG4 value to a GT scratch slot via SRM. */
*batch++ = MI_STORE_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
*batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
*batch++ = intel_gt_scratch_offset(engine->gt,
INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
*batch++ = 0;
/* NOTE(review): truncated — per the header comment, the constant upload, the PIPE_CONTROL flush, the LRM restore and `return batch;` should follow but are missing from this chunk. */
/* * Typically we only have one indirect_ctx and per_ctx batch buffer which are * initialized at the beginning and shared across all contexts but this field * helps us to have multiple batches at different offsets and select them based * on a criteria. At the moment this batch always start at the beginning of the page * and at this point we don't have multiple wa_ctx batch buffers. * * The number of WA applied are not known at the beginning; we use this field * to return the no of DWORDS written. * * It is to be noted that this batch does not contain MI_BATCH_BUFFER_END * so it adds NOOPs as padding to make it cacheline aligned. * MI_BATCH_BUFFER_END will be added to perctx batch and both of them together * makes a complete batch buffer.
*/ static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
{ /* WaDisableCtxRestoreArbitration:bdw,chv */
*batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
/* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */ if (IS_BROADWELL(engine->i915))
batch = gen8_emit_flush_coherentl3_wa(engine, batch);
/* WaClearSlmSpaceAtContextSwitch:bdw,chv */ /* Actual scratch location is at 128 bytes offset */
batch = gen8_emit_pipe_control(batch,
PIPE_CONTROL_FLUSH_L3 |
PIPE_CONTROL_STORE_DATA_INDEX |
PIPE_CONTROL_CS_STALL |
PIPE_CONTROL_QW_WRITE,
LRC_PPHWSP_SCRATCH_ADDR);
/* Re-enable the arbitration that was disabled at the top of the batch. */
*batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
/* Pad to end of cacheline */ while ((unsignedlong)batch % CACHELINE_BYTES)
*batch++ = MI_NOOP;
/* * MI_BATCH_BUFFER_END is not required in Indirect ctx BB because * execution depends on the length specified in terms of cache lines * in the register CTX_RCS_INDIRECT_CTX
*/
/* NOTE(review): truncated — `return batch;` and the closing brace are missing from this chunk. */
/*
 * NOTE(review): orphaned fragment — the enclosing function definition is
 * missing. The bxt/glk workaround tag suggests this is part of the gen9
 * indirect-ctx batch builder — TODO confirm against the original file.
 */
/* WaMediaPoolStateCmdInWABB:bxt,glk */ if (HAS_POOLED_EU(engine->i915)) { /* * EU pool configuration is setup along with golden context * during context initialization. This value depends on * device type (2x6 or 3x6) and needs to be updated based * on which subslice is disabled especially for 2x6 * devices, however it is safe to load default * configuration of 3x6 device instead of masking off * corresponding bits because HW ignores bits of a disabled * subslice and drops down to appropriate config. Please * see render_state_setup() in i915_gem_render_state.c for * possible configurations, to avoid duplication they are * not shown here again.
*/
*batch++ = GEN9_MEDIA_POOL_STATE;
*batch++ = GEN9_MEDIA_POOL_ENABLE;
*batch++ = 0x00777000;
*batch++ = 0;
*batch++ = 0;
*batch++ = 0;
}
*batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
/* Pad to end of cacheline */ while ((unsignedlong)batch % CACHELINE_BYTES)
*batch++ = MI_NOOP;
/*
 * NOTE(review): orphaned fragment of the WA-context setup path — `ww`,
 * `wa_ctx`, `batch`, `batch_ptr`, `wa_bb` and `wa_bb_fn` are declared in
 * the (missing) enclosing function.
 */
err = lrc_create_wa_ctx(engine); if (err) { /* * We continue even if we fail to initialize WA batch * because we only expect rare glitches but nothing * critical to prevent us from using GPU
*/
drm_err(&engine->i915->drm, "Ignoring context switch w/a allocation error:%d\n",
err); return;
}
if (!engine->wa_ctx.vma) return;
/* Lock and pin the WA batch object in the GGTT (ww locking with retry). */
i915_gem_ww_ctx_init(&ww, true);
retry:
err = i915_gem_object_lock(wa_ctx->vma->obj, &ww); if (!err)
err = i915_ggtt_pin(wa_ctx->vma, &ww, 0, PIN_HIGH); if (err) goto err;
/* * Emit the two workaround batch buffers, recording the offset from the * start of the workaround batch buffer object for each and their * respective sizes.
*/
batch_ptr = batch; for (i = 0; i < ARRAY_SIZE(wa_bb_fn); i++) {
/* Each WA batch must start cacheline-aligned within the object. */
wa_bb[i]->offset = batch_ptr - batch; if (GEM_DEBUG_WARN_ON(!IS_ALIGNED(wa_bb[i]->offset,
CACHELINE_BYTES))) {
err = -EINVAL; break;
} if (wa_bb_fn[i])
batch_ptr = wa_bb_fn[i](engine, batch_ptr);
wa_bb[i]->size = batch_ptr - (batch + wa_bb[i]->offset);
}
GEM_BUG_ON(batch_ptr - batch > CTX_WA_BB_SIZE);
static u32 lrc_get_runtime(conststruct intel_context *ce)
{ /* * We can use either ppHWSP[16] which is recorded before the context * switch (and so excludes the cost of context switches) or use the * value from the context image itself, which is saved/restored earlier * and so includes the cost of the save.
*/ return READ_ONCE(ce->lrc_reg_state[CTX_TIMESTAMP]);
}
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder die Vollständigkeit noch die Richtigkeit
noch die Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.