/** This vma's place in the execbuf reservation list */ struct drm_i915_gem_exec_object2 *exec; struct list_head bind_link; struct list_head reloc_link;
struct hlist_node node;
u32 handle;
};
/*
 * Relocation back-ends that can be forced for debugging. The values start
 * at 1 so that 0 can mean "nothing forced".
 */
enum {
	FORCE_CPU_RELOC = 1,
	FORCE_GTT_RELOC = 2,
	FORCE_GPU_RELOC = 3,
};

/* Set to one of the FORCE_*_RELOC values above; 0 leaves the choice alone. */
#define DBG_FORCE_RELOC 0
/*
 * Execbuf-internal per-object flag bits. They are stored in the same flag
 * word (ev->flags) as the userspace-visible EXEC_OBJECT_* bits, so they are
 * allocated downwards from the top of the word.
 *
 * __EXEC_OBJECT_ flags > BIT(29) defined in i915_vma.h
 */
/* Set once the vma has been successfully pinned for this execbuf. */
#define __EXEC_OBJECT_HAS_PIN BIT(29)
/* Set when a fence was pinned for the vma; undone by eb_unreserve_vma(). */
#define __EXEC_OBJECT_HAS_FENCE BIT(28)
/* Set after i915_gem_object_userptr_submit_init() succeeded for the object. */
#define __EXEC_OBJECT_USERPTR_INIT BIT(27)
/* Requests PIN_MAPPABLE when the pin flags are composed. */
#define __EXEC_OBJECT_NEEDS_MAP BIT(26)
/* Set on batch buffers that carry relocations and are not EXEC_OBJECT_PINNED. */
#define __EXEC_OBJECT_NEEDS_BIAS BIT(25)
/* Mask covering bit 25 and every bit above it. */
#define __EXEC_OBJECT_INTERNAL_FLAGS (~0u << 25) /* all of the above + */
/* The reservation state bits that eb_unreserve_vma() clears. */
#define __EXEC_OBJECT_RESERVED (__EXEC_OBJECT_HAS_PIN | __EXEC_OBJECT_HAS_FENCE)
/*
 * Catch emission of unexpected errors for CI!
 *
 * With CONFIG_DRM_I915_DEBUG_GEM enabled, EINVAL is redefined for the rest
 * of this file as a statement expression that first logs the function name
 * and line number where it is used and then evaluates to 22. Every
 * "return -EINVAL" below therefore leaves a debug trace of its origin.
 */
#if IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)
#undef EINVAL
#define EINVAL ({ \
	DRM_DEBUG_DRIVER("EINVAL at %s:%d\n", __func__, __LINE__); \
	22; \
})
#endif
/** * DOC: User command execution * * Userspace submits commands to be executed on the GPU as an instruction * stream within a GEM object we call a batchbuffer. These instructions may * refer to other GEM objects containing auxiliary state such as kernels, * samplers, render targets and even secondary batchbuffers. Userspace does * not know where in the GPU memory these objects reside and so before the * batchbuffer is passed to the GPU for execution, those addresses in the * batchbuffer and auxiliary objects are updated. This is known as relocation, * or patching. To try and avoid having to relocate each object on the next * execution, userspace is told the location of those objects in this pass, * but this remains just a hint as the kernel may choose a new location for * any object in the future. * * At the level of talking to the hardware, submitting a batchbuffer for the * GPU to execute is to add content to a buffer from which the HW * command streamer is reading. * * 1. Add a command to load the HW context. For Logical Ring Contexts, i.e. * Execlists, this command is not placed on the same buffer as the * remaining items. * * 2. Add a command to invalidate caches to the buffer. * * 3. Add a batchbuffer start command to the buffer; the start command is * essentially a token together with the GPU address of the batchbuffer * to be executed. * * 4. Add a pipeline flush to the buffer. * * 5. Add a memory write command to the buffer to record when the GPU * is done executing the batchbuffer. The memory write writes the * global sequence number of the request, ``i915_request::global_seqno``; * the i915 driver uses the current value in the register to determine * if the GPU has completed the batchbuffer. * * 6. Add a user interrupt command to the buffer. This command instructs * the GPU to issue an interrupt when the command, pipeline flush and * memory write are completed. * * 7. 
Inform the hardware of the additional commands added to the buffer * (by updating the tail pointer). * * Processing an execbuf ioctl is conceptually split up into a few phases. * * 1. Validation - Ensure all the pointers, handles and flags are valid. * 2. Reservation - Assign GPU address space for every object * 3. Relocation - Update any addresses to point to the final locations * 4. Serialisation - Order the request with respect to its dependencies * 5. Construction - Construct a request to execute the batchbuffer * 6. Submission (at some point in the future execution) * * Reserving resources for the execbuf is the most complicated phase. We * neither want to have to migrate the object in the address space, nor do * we want to have to update any relocations pointing to this object. Ideally, * we want to leave the object where it is and for all the existing relocations * to match. If the object is given a new address, or if userspace thinks the * object is elsewhere, we have to parse all the relocation entries and update * the addresses. Userspace can set the I915_EXEC_NO_RELOC flag to hint that * all the target addresses in all of its objects match the value in the * relocation entries and that they all match the presumed offsets given by the * list of execbuffer objects. Using this knowledge, we know that if we haven't * moved any buffers, all the relocation entries are valid and we can skip * the update. (If userspace is wrong, the likely outcome is an impromptu GPU * hang.) The requirements for using I915_EXEC_NO_RELOC are: * * The addresses written in the objects must match the corresponding * reloc.presumed_offset which in turn must match the corresponding * execobject.offset. * * Any render targets written to in the batch must be flagged with * EXEC_OBJECT_WRITE. * * To avoid stalling, execobject.offset should match the current * address of that object within the active context. * * The reservation is done in multiple phases. 
First we try and keep any * object already bound in its current location - so long as it meets the * constraints imposed by the new execbuffer. Any object left unbound after the * first pass is then fitted into any available idle space. If an object does * not fit, all objects are removed from the reservation and the process rerun * after sorting the objects into a priority order (more difficult to fit * objects are tried first). Failing that, the entire VM is cleared and we try * to fit the execbuf one last time before concluding that it simply will not * fit. * * A small complication to all of this is that we allow userspace not only to * specify an alignment and a size for the object in the address space, but * we also allow userspace to specify the exact offset. These objects are * simpler to place (the location is known a priori): all we have to do is make * sure the space is available. * * Once all the objects are in place, patching up the buried pointers to point * to the final locations is a fairly simple job of walking over the relocation * entry arrays, looking up the right address and rewriting the value into * the object. Simple! ... The relocation entries are stored in user memory * and so to access them we have to copy them into a local buffer. That copy * has to avoid taking any pagefaults as they may lead back to a GEM object * requiring the struct_mutex (i.e. recursive deadlock). So once again we split * the relocation into multiple passes. First we try to do everything within an * atomic context (avoid the pagefaults) which requires that we never wait. If * we detect that we may wait, or if we need to fault, then we have to fall back * to a slower path. The slowpath has to drop the mutex. (Can you hear alarm * bells yet?) Dropping the mutex means that we lose all the state we have * built up so far for the execbuf and we must reset any global data. 
However, * we do leave the objects pinned in their final locations - which is a * potential issue for concurrent execbufs. Once we have left the mutex, we can * allocate and copy all the relocation entries into a large array at our * leisure, reacquire the mutex, reclaim all the objects and other state and * then proceed to update any incorrect addresses within the objects. * * As we process the relocation entries, we maintain a record of whether the * object is being written to. Using NO_RELOC, we expect userspace to provide * this information instead. We also check whether we can skip the relocation * by comparing the expected value inside the relocation entry with the target's * final address. If they differ, we have to map the current object and rewrite * the 4 or 8 byte pointer within. * * Serialising an execbuf is quite simple according to the rules of the GEM * ABI. Execution within each context is ordered by the order of submission. * Writes to any GEM object are in order of submission and are exclusive. Reads * from a GEM object are unordered with respect to other reads, but ordered by * writes. A write submitted after a read cannot occur before the read, and * similarly any read submitted after a write cannot occur before the write. * Writes are ordered between engines such that only one write occurs at any * time (completing any reads beforehand) - using semaphores where available * and CPU serialisation otherwise. Other GEM accesses obey the same rules: any * write (either via mmaps using set-domain, or via pwrite) must flush all GPU * reads before starting, and any read (either using set-domain or pread) must * flush all GPU writes before starting. (Note we only employ a barrier before, * we currently rely on userspace not concurrently starting a new execution * whilst reading or writing to an object. This may be an advantage or not * depending on how much you trust userspace not to shoot themselves in the * foot.) 
Serialisation may just result in the request being inserted into * a DAG awaiting its turn, but the simplest is to wait on the CPU until * all dependencies are resolved. * * After all of that, it is just a matter of closing the request and handing it to * the hardware (well, leaving it in a queue to be executed). However, we also * offer the ability for batchbuffers to be run with elevated privileges so * that they access otherwise hidden registers. (Used to adjust L3 cache etc.) * Before any batch is given extra privileges we first must check that it * contains no nefarious instructions, we check that each instruction is from * our whitelist and all registers are also from an allowed list. We first * copy the user's batchbuffer to a shadow (so that the user doesn't have * access to it, either by the CPU or GPU as we scan it) and then parse each * instruction. If everything is ok, we set a flag telling the hardware to run * the batchbuffer in trusted mode, otherwise the ioctl is rejected.
*/
/*
 * One fence entry tracked for an execbuf.
 * NOTE(review): the per-field roles below are read off the member names and
 * types only; the code that fills and consumes them is outside this excerpt,
 * so confirm against the users before relying on them.
 */
struct eb_fence {
	struct drm_syncobj *syncobj; /* Use with ptr_mask_bits() */
	struct dma_fence *dma_fence;
	u64 value; /* presumably the timeline point used with chain_fence - TODO confirm */
	struct dma_fence_chain *chain_fence;
};
struct intel_gt *gt; /* gt for the execbuf */ struct intel_context *context; /* logical state for the request */ struct i915_gem_context *gem_context; /** caller's context */
intel_wakeref_t wakeref;
intel_wakeref_t wakeref_gt0;
/** our requests to build */ struct i915_request *requests[MAX_ENGINE_INSTANCE + 1]; /** identity of the batch obj/vma */ struct eb_vma *batches[MAX_ENGINE_INSTANCE + 1]; struct i915_vma *trampoline; /** trampoline used for chaining */
/** used for excl fence in dma_resv objects when > 1 BB submitted */ struct dma_fence *composite_fence;
/** actual size of execobj[] as we may extend it for the cmdparser */ unsignedint buffer_count;
/* number of batches in execbuf IOCTL */ unsignedint num_batches;
/** list of vma not yet bound during reservation phase */ struct list_head unbound;
/** list of vma that have execobj.relocation_count */ struct list_head relocs;
struct i915_gem_ww_ctx ww;
/** * Track the most recently used object for relocations, as we * frequently have to perform multiple relocations within the same * obj/page
*/ struct reloc_cache { struct drm_mm_node node; /** temporary GTT binding */ unsignedlong vaddr; /** Current kmap address */ unsignedlong page; /** Currently mapped page index */ unsignedint graphics_ver; /** Cached value of GRAPHICS_VER */ bool use_64bit_reloc : 1; bool has_llc : 1; bool has_fence : 1; bool needs_unfenced : 1;
} reloc_cache;
u64 invalid_flags; /** Set of execobj.flags that are invalid */
/** Length of batch within object */
u64 batch_len[MAX_ENGINE_INSTANCE + 1];
u32 batch_start_offset; /** Location within object of batch */
u32 batch_flags; /** Flags composed for emit_bb_start() */ struct intel_gt_buffer_pool_node *batch_pool; /** pool node for batch buffer */
/** * Indicate either the size of the hashtable used to resolve * relocation handles, or if negative that we are using a direct * index into the execobj[].
*/ int lut_size; struct hlist_head *buckets; /** ht for relocation handles */
/* * Without a 1:1 association between relocation handles and * the execobject[] index, we instead create a hashtable. * We size it dynamically based on available memory, starting * first with 1:1 associative hash and scaling back until * the allocation succeeds. * * Later on we use a positive lut_size to indicate we are * using this hashtable, and a negative value to indicate a * direct lookup.
*/ do {
gfp_t flags;
/* While we can still reduce the allocation size, don't * raise a warning and allow the allocation to fail. * On the last pass though, we want to try as hard * as possible to perform the allocation and warn * if it fails.
*/
flags = GFP_KERNEL; if (size > 1)
flags |= __GFP_NORETRY | __GFP_NOWARN;
eb->buckets = kzalloc(sizeof(struct hlist_head) << size,
flags); if (eb->buckets) break;
} while (--size);
if (exec_flags & EXEC_OBJECT_NEEDS_GTT)
pin_flags |= PIN_GLOBAL;
/* * Wa32bitGeneralStateOffset & Wa32bitInstructionBaseOffset, * limit address to the first 4GBs for unflagged objects.
*/ if (!(exec_flags & EXEC_OBJECT_SUPPORTS_48B_ADDRESS))
pin_flags |= PIN_ZONE_4G;
if (exec_flags & __EXEC_OBJECT_NEEDS_MAP)
pin_flags |= PIN_MAPPABLE;
/* Attempt to reuse the current location if available */
err = i915_vma_pin_ww(vma, &eb->ww, 0, 0, pin_flags); if (err == -EDEADLK) return err;
if (unlikely(err)) { if (entry->flags & EXEC_OBJECT_PINNED) return err;
/* Failing that pick any _free_ space if suitable */
err = i915_vma_pin_ww(vma, &eb->ww,
entry->pad_to_size,
entry->alignment,
eb_pin_flags(entry, ev->flags) |
PIN_USER | PIN_NOEVICT | PIN_VALIDATE); if (unlikely(err)) return err;
}
if (unlikely(ev->flags & EXEC_OBJECT_NEEDS_FENCE)) {
err = i915_vma_pin_fence(vma); if (unlikely(err)) return err;
if (vma->fence)
ev->flags |= __EXEC_OBJECT_HAS_FENCE;
}
ev->flags |= __EXEC_OBJECT_HAS_PIN; if (eb_vma_misplaced(entry, vma, ev->flags)) return -EBADSLT;
return 0;
}
staticvoid
eb_unreserve_vma(struct eb_vma *ev)
{ if (unlikely(ev->flags & __EXEC_OBJECT_HAS_FENCE))
__i915_vma_unpin_fence(ev->vma);
ev->flags &= ~__EXEC_OBJECT_RESERVED;
}
staticint
eb_validate_vma(struct i915_execbuffer *eb, struct drm_i915_gem_exec_object2 *entry, struct i915_vma *vma)
{ /* Relocations are disallowed for all platforms after TGL-LP. This * also covers all platforms with local memory.
*/ if (entry->relocation_count &&
GRAPHICS_VER(eb->i915) >= 12 && !IS_TIGERLAKE(eb->i915)) return -EINVAL;
if (unlikely(entry->flags & eb->invalid_flags)) return -EINVAL;
if (unlikely(entry->alignment &&
!is_power_of_2_u64(entry->alignment))) return -EINVAL;
/* * Offset can be used as input (EXEC_OBJECT_PINNED), reject * any non-page-aligned or non-canonical addresses.
*/ if (unlikely(entry->flags & EXEC_OBJECT_PINNED &&
entry->offset != gen8_canonical_addr(entry->offset & I915_GTT_PAGE_MASK))) return -EINVAL;
/* pad_to_size was once a reserved field, so sanitize it */ if (entry->flags & EXEC_OBJECT_PAD_TO_SIZE) { if (unlikely(offset_in_page(entry->pad_to_size))) return -EINVAL;
} else {
entry->pad_to_size = 0;
} /* * From drm_mm perspective address space is continuous, * so from this point we're always using non-canonical * form internally.
*/
entry->offset = gen8_noncanonical_addr(entry->offset);
if (entry->relocation_count)
list_add_tail(&ev->reloc_link, &eb->relocs);
/* * SNA is doing fancy tricks with compressing batch buffers, which leads * to negative relocation deltas. Usually that works out ok since the * relocate address is still positive, except when the batch is placed * very low in the GTT. Ensure this doesn't happen. * * Note that actual hangs have only been observed on gen7, but for * paranoia do it everywhere.
*/ if (is_batch_buffer(eb, i)) { if (entry->relocation_count &&
!(ev->flags & EXEC_OBJECT_PINNED))
ev->flags |= __EXEC_OBJECT_NEEDS_BIAS; if (eb->reloc_cache.has_fence)
ev->flags |= EXEC_OBJECT_NEEDS_FENCE;
eb->batches[*current_batch] = ev;
if (unlikely(ev->flags & EXEC_OBJECT_WRITE)) {
drm_dbg(&i915->drm, "Attempting to use self-modifying batch buffer\n"); return -EINVAL;
}
if (range_overflows_t(u64,
eb->batch_start_offset,
eb->args->batch_len,
ev->vma->size)) {
drm_dbg(&i915->drm, "Attempting to use out-of-bounds batch\n"); return -EINVAL;
}
if (DBG_FORCE_RELOC == FORCE_CPU_RELOC) returntrue;
if (DBG_FORCE_RELOC == FORCE_GTT_RELOC) returnfalse;
/* * For objects created by userspace through GEM_CREATE with pat_index * set by set_pat extension, i915_gem_object_has_cache_level() always * return true, otherwise the call would fall back to checking whether * the object is un-cached.
*/ return (cache->has_llc ||
obj->cache_dirty ||
!i915_gem_object_has_cache_level(obj, I915_CACHE_NONE));
}
/* * We have one or more buffers that we couldn't bind, which could be due to * various reasons. To resolve this we have 4 passes, with every next * level turning the screws tighter: * * 0. Unbind all objects that do not match the GTT constraints for the * execbuffer (fenceable, mappable, alignment etc). Bind all new * objects. This avoids unnecessary unbinding of later objects in order * to make room for the earlier objects *unless* we need to defragment. * * 1. Reorder the buffers, where objects with the most restrictive * placement requirements go first (ignoring fixed location buffers for * now). For example, objects needing the mappable aperture (the first * 256M of GTT), should go first vs objects that can be placed just * about anywhere. Repeat the previous pass. * * 2. Consider buffers that are pinned at a fixed location. Also try to * evict the entire VM this time, leaving only objects that we were * unable to lock. Try again to bind the buffers. (still using the new * buffer order). * * 3. We likely have object lock contention for one or more stubborn * objects in the VM, for which we need to evict to make forward * progress (perhaps we are fighting the shrinker?). When evicting the * VM this time around, anything that we can't lock we now track using * the busy_bo, using the full lock (after dropping the vm->mutex to * prevent deadlocks), instead of trylock. We then continue to evict the * VM, this time with the stubborn object locked, which we can now * hopefully unbind (if still bound in the VM). Repeat until the VM is * evicted. Finally we should be able to bind everything.
*/ for (pass = 0; pass <= 3; pass++) { int pin_flags = PIN_USER | PIN_VALIDATE;
if (pass == 0)
pin_flags |= PIN_NONBLOCK;
if (pass >= 1)
eb_unbind(eb, pass >= 2);
if (pass == 2) {
err = mutex_lock_interruptible(&eb->context->vm->mutex); if (!err) {
err = i915_gem_evict_vm(eb->context->vm, &eb->ww, NULL);
mutex_unlock(&eb->context->vm->mutex);
} if (err) return err;
}
if (pass == 3) {
retry:
err = mutex_lock_interruptible(&eb->context->vm->mutex); if (!err) { struct drm_i915_gem_object *busy_bo = NULL;
err = i915_gem_evict_vm(eb->context->vm, &eb->ww, &busy_bo);
mutex_unlock(&eb->context->vm->mutex); if (err && busy_bo) {
err = i915_gem_object_lock(busy_bo, &eb->ww);
i915_gem_object_put(busy_bo); if (!err) goto retry;
}
} if (err) return err;
}
lut = i915_lut_handle_alloc(); if (unlikely(!lut)) return -ENOMEM;
i915_vma_get(vma); if (!atomic_fetch_inc(&vma->open_count))
i915_vma_reopen(vma);
lut->handle = handle;
lut->ctx = ctx;
/* Check that the context hasn't been closed in the meantime */
err = -EINTR; if (!mutex_lock_interruptible(&ctx->lut_mutex)) { if (likely(!i915_gem_context_is_closed(ctx)))
err = radix_tree_insert(&ctx->handles_vma, handle, vma); else
err = -ENOENT; if (err == 0) { /* And nor has this handle */ struct drm_i915_gem_object *obj = vma->obj;
do { struct drm_i915_gem_object *obj; struct i915_vma *vma; int err;
rcu_read_lock();
vma = radix_tree_lookup(&eb->gem_context->handles_vma, handle); if (likely(vma && vma->vm == vm))
vma = i915_vma_tryget(vma);
rcu_read_unlock(); if (likely(vma)) return vma;
obj = i915_gem_object_lookup(eb->file, handle); if (unlikely(!obj)) return ERR_PTR(-ENOENT);
/* * If the user has opted-in for protected-object tracking, make * sure the object encryption can be used. * We only need to do this when the object is first used with * this context, because the context itself will be banned when * the protected objects become invalid.
*/ if (i915_gem_context_uses_protected_content(eb->gem_context) &&
i915_gem_object_is_protected(obj)) {
err = intel_pxp_key_check(intel_bo_to_drm_bo(obj), true); if (err) {
i915_gem_object_put(obj); return ERR_PTR(err);
}
}
err = eb_add_vma(eb, ¤t_batch, i, vma); if (err) return err;
if (i915_gem_object_is_userptr(vma->obj)) {
err = i915_gem_object_userptr_submit_init(vma->obj); if (err) { if (i + 1 < eb->buffer_count) { /* * Execbuffer code expects last vma entry to be NULL, * since we already initialized this entry, * set the next value to NULL or we mess up * cleanup handling.
*/
eb->vma[i + 1].vma = NULL;
}
if (cache->vaddr) {
intel_gt_flush_ggtt_writes(ggtt->vm.gt);
io_mapping_unmap_atomic((void __force __iomem *) unmask_page(cache->vaddr));
} else { struct i915_vma *vma = ERR_PTR(-ENODEV); int err;
if (i915_gem_object_is_tiled(obj)) return ERR_PTR(-EINVAL);
if (use_cpu_reloc(cache, obj)) return NULL;
err = i915_gem_object_set_to_gtt_domain(obj, true); if (err) return ERR_PTR(err);
/* * i915_gem_object_ggtt_pin_ww may attempt to remove the batch * VMA from the object list because we no longer pin. * * Only attempt to pin the batch buffer to ggtt if the current batch * is not inside ggtt, or the batch buffer is not misplaced.
*/ if (!i915_is_ggtt(batch->vm) ||
!i915_vma_misplaced(batch, 0, 0, PIN_MAPPABLE)) {
vma = i915_gem_object_ggtt_pin_ww(obj, &eb->ww, NULL, 0, 0,
PIN_MAPPABLE |
PIN_NONBLOCK /* NOWARN */ |
PIN_NOEVICT);
}
if (vma == ERR_PTR(-EDEADLK)) return vma;
if (IS_ERR(vma)) {
memset(&cache->node, 0, sizeof(cache->node));
mutex_lock(&ggtt->vm.mutex);
err = drm_mm_insert_node_in_range
(&ggtt->vm.mm, &cache->node,
PAGE_SIZE, 0, I915_COLOR_UNEVICTABLE,
0, ggtt->mappable_end,
DRM_MM_INSERT_LOW);
mutex_unlock(&ggtt->vm.mutex); if (err) /* no inactive aperture space, use cpu reloc */ return NULL;
} else {
cache->node.start = i915_ggtt_offset(vma);
cache->node.mm = (void *)vma;
}
}
/* * Writes to the same cacheline are serialised by the CPU * (including clflush). On the write path, we only require * that it hits memory in an orderly fashion and place * mb barriers at the start and end of the relocation phase * to ensure ordering of clflush wrt to the system.
*/ if (flushes & CLFLUSH_AFTER)
drm_clflush_virt_range(addr, sizeof(*addr));
} else
*addr = value;
}
/* we've already hold a reference to all valid objects */
target = eb_get_vma(eb, reloc->target_handle); if (unlikely(!target)) return -ENOENT;
/* Validate that the target is in a valid r/w GPU domain */ if (unlikely(reloc->write_domain & (reloc->write_domain - 1))) {
drm_dbg(&i915->drm, "reloc with multiple write domains: " "target %d offset %d " "read %08x write %08x\n",
reloc->target_handle,
(int) reloc->offset,
reloc->read_domains,
reloc->write_domain); return -EINVAL;
} if (unlikely((reloc->write_domain | reloc->read_domains)
& ~I915_GEM_GPU_DOMAINS)) {
drm_dbg(&i915->drm, "reloc with read/write non-GPU domains: " "target %d offset %d " "read %08x write %08x\n",
reloc->target_handle,
(int) reloc->offset,
reloc->read_domains,
reloc->write_domain); return -EINVAL;
}
if (reloc->write_domain) {
target->flags |= EXEC_OBJECT_WRITE;
/* * Sandybridge PPGTT errata: We need a global gtt mapping * for MI and pipe_control writes because the gpu doesn't * properly redirect them through the ppgtt for non_secure * batchbuffers.
*/ if (reloc->write_domain == I915_GEM_DOMAIN_INSTRUCTION &&
GRAPHICS_VER(eb->i915) == 6 &&
!i915_vma_is_bound(target->vma, I915_VMA_GLOBAL_BIND)) { struct i915_vma *vma = target->vma;
/* * If the relocation already has the right value in it, no * more work needs to be done.
*/ if (!DBG_FORCE_RELOC &&
gen8_canonical_addr(i915_vma_offset(target->vma)) == reloc->presumed_offset) return 0;
/* Check that the relocation address is valid... */ if (unlikely(reloc->offset >
ev->vma->size - (eb->reloc_cache.use_64bit_reloc ? 8 : 4))) {
drm_dbg(&i915->drm, "Relocation beyond object bounds: " "target %d offset %d size %d.\n",
reloc->target_handle,
(int)reloc->offset,
(int)ev->vma->size); return -EINVAL;
} if (unlikely(reloc->offset & 3)) {
drm_dbg(&i915->drm, "Relocation not 4-byte aligned: " "target %d offset %d.\n",
reloc->target_handle,
(int)reloc->offset); return -EINVAL;
}
/* * If we write into the object, we need to force the synchronisation * barrier, either with an asynchronous clflush or if we executed the * patching using the GPU (though that should be serialised by the * timeline). To be completely sure, and since we are required to * do relocations we are already stalling, disable the user's opt * out of our synchronisation.
*/
ev->flags &= ~EXEC_OBJECT_ASYNC;
/* and update the user's relocation entry */ return relocate_entry(ev->vma, reloc, eb, target->vma);
}
if (unlikely(remain > N_RELOC(INT_MAX))) return -EINVAL;
/* * We must check that the entire relocation array is safe * to read. However, if the array is not writable the user loses * the updated relocation values.
*/ if (unlikely(!access_ok(urelocs, remain * sizeof(*urelocs)))) return -EFAULT;
/* * This is the fast path and we cannot handle a pagefault * whilst holding the struct mutex lest the user pass in the * relocations contained within a mmaped bo. For in such a case * we, the page fault handler would call i915_gem_fault() and * we would try to acquire the struct mutex again. Obviously * this is bad and so lockdep complains vehemently.
*/
pagefault_disable();
copied = __copy_from_user_inatomic(r, urelocs, count * sizeof(r[0]));
pagefault_enable(); if (unlikely(copied)) {
remain = -EFAULT; goto out;
}
if (likely(offset == 0)) {
} elseif ((s64)offset < 0) {
remain = (int)offset; goto out;
} else { /* * Note that reporting an error now * leaves everything in an inconsistent * state as we have *already* changed * the relocation value inside the * object. As we have not changed the * reloc.presumed_offset or will not * change the execobject.offset, on the * call we may not rewrite the value * inside the object, leaving it * dangling and causing a GPU hang. Unless * userspace dynamically rebuilds the * relocations on each execbuf rather than * presume a static tree. * * We did previously check if the relocations * were writable (access_ok), an error now * would be a strange race with mprotect, * having already demonstrated that we * can read from this userspace address.
*/
offset = gen8_canonical_addr(offset & ~UPDATE);
__put_user(offset,
&urelocs[r - stack].presumed_offset);
}
} while (r++, --count);
urelocs += ARRAY_SIZE(stack);
} while (remain);
out:
reloc_cache_reset(&eb->reloc_cache, eb); return remain;
}
/* * As we do not update the known relocation offsets after * relocating (due to the complexities in lock handling), * we need to mark them as invalid now so that we force the * relocation processing next time. Just in case the target * object is evicted and then rebound into its old * presumed_offset before the next execbuffer - if that * happened we would make the mistake of assuming that the * relocations were valid.
*/ if (!user_access_begin(urelocs, size)) goto end;
if (likely(!(eb->args->flags & __EXEC_USERPTR_USED))) return 0;
for (i = 0; i < count; i++) { struct eb_vma *ev = &eb->vma[i];
if (!i915_gem_object_is_userptr(ev->vma->obj)) continue;
ret = i915_gem_object_userptr_submit_init(ev->vma->obj); if (ret) return ret;
ev->flags |= __EXEC_OBJECT_USERPTR_INIT;
}
return 0;
}
static noinline int eb_relocate_parse_slow(struct i915_execbuffer *eb)
{ bool have_copy = false; struct eb_vma *ev; int err = 0;
repeat: if (signal_pending(current)) {
err = -ERESTARTSYS; goto out;
}
/* We may process another execbuffer during the unlock... */
eb_release_vmas(eb, false);
i915_gem_ww_ctx_fini(&eb->ww);
/* * We take 3 passes through the slowpath. * * 1 - we try to just prefault all the user relocation entries and * then attempt to reuse the atomic pagefault disabled fast path again. * * 2 - we copy the user entries to a local buffer here outside of the * lock and allow ourselves to wait upon any rendering before * relocations * * 3 - we already have a local copy of the relocation entries, but * were interrupted (EAGAIN) whilst waiting for the objects, try again.
*/ if (!err) {
err = eb_prefault_relocations(eb);
} elseif (!have_copy) {
err = eb_copy_relocations(eb);
have_copy = err == 0;
} else {
cond_resched();
err = 0;
}
if (!err)
err = eb_reinit_userptr(eb);
i915_gem_ww_ctx_init(&eb->ww, true); if (err) goto out;
/* reacquire the objects */
repeat_validate:
err = eb_pin_engine(eb, false); if (err) goto err;
err = eb_validate_vmas(eb); if (err) goto err;
GEM_BUG_ON(!eb->batches[0]);
list_for_each_entry(ev, &eb->relocs, reloc_link) { if (!have_copy) {
err = eb_relocate_vma(eb, ev); if (err) break;
} else {
err = eb_relocate_vma_slow(eb, ev); if (err) break;
}
}
if (err == -EDEADLK) goto err;
if (err && !have_copy) goto repeat;
if (err) goto err;
/* as last step, parse the command buffer */
err = eb_parse(eb); if (err) goto err;
/* * Leave the user relocations as are, this is the painfully slow path, * and we want to avoid the complication of dropping the lock whilst * having buffers reserved in the aperture and so causing spurious * ENOSPC for random operations.
*/
err: if (err == -EDEADLK) {
eb_release_vmas(eb, false);
err = i915_gem_ww_ctx_backoff(&eb->ww); if (!err) goto repeat_validate;
}
if (err == -EAGAIN) goto repeat;
out: if (have_copy) { constunsignedint count = eb->buffer_count; unsignedint i;
for (i = 0; i < count; i++) { conststruct drm_i915_gem_exec_object2 *entry =
&eb->exec[i]; struct drm_i915_gem_relocation_entry *relocs;
err: if (err == -EDEADLK) {
eb_release_vmas(eb, false);
err = i915_gem_ww_ctx_backoff(&eb->ww); if (!err) goto retry;
}
return err;
slow:
err = eb_relocate_parse_slow(eb); if (err) /* * If the user expects the execobject.offset and * reloc.presumed_offset to be an exact match, * as for using NO_RELOC, then we cannot update * the execobject.offset until we have completed * relocation.
*/
eb->args->flags &= ~__EXEC_HAS_RELOC;
return err;
}
/*
 * Using two helper loops for the order in which requests / batches are created
 * and added to the backend. Requests are created in order from the parent to
 * the last child. Requests are added in the reverse order, from the last child
 * to parent. This is done for locking reasons as the timeline lock is acquired
 * during request creation and released when the request is added to the
 * backend. To make lockdep happy (see intel_context_timeline_lock) this must be
 * the ordering.
 *
 * for_each_batch_create_order() walks indices 0 .. num_batches - 1.
 *
 * for_each_batch_add_order() walks indices num_batches - 1 .. 0. The loop
 * ends when _i drops below zero, so _i must be a signed int; the
 * BUILD_BUG_ON(typecheck()) enforces that at compile time. Note that this
 * macro expands to two statements (the build check, then the for loop), so
 * it must not be used as the sole, unbraced body of an if/else or another
 * loop.
 */
#define for_each_batch_create_order(_eb, _i) \
	for ((_i) = 0; (_i) < (_eb)->num_batches; ++(_i))
#define for_each_batch_add_order(_eb, _i) \
	BUILD_BUG_ON(!typecheck(int, _i)); \
	for ((_i) = (_eb)->num_batches - 1; (_i) >= 0; --(_i))
staticstruct i915_request *
eb_find_first_request_added(struct i915_execbuffer *eb)
{ int i;
for_each_batch_add_order(eb, i) if (eb->requests[i]) return eb->requests[i];
GEM_BUG_ON("Request not found");
return NULL;
}
#if IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR)
/* Stage with GFP_KERNEL allocations before we enter the signaling critical path */ staticint eb_capture_stage(struct i915_execbuffer *eb)
{ constunsignedint count = eb->buffer_count; unsignedint i = count, j;
/* * Release anything that didn't get committed due to errors. * The capture_list will otherwise be freed at request retire.
*/ staticvoid eb_capture_release(struct i915_execbuffer *eb)
{ unsignedint j;
/* * If the GPU is not _reading_ through the CPU cache, we need * to make sure that any writes (both previous GPU writes from * before a change in snooping levels and normal CPU writes) * caught in that cache are flushed to main memory. * * We want to say * obj->cache_dirty && * !(obj->cache_coherent & I915_BO_CACHE_COHERENT_FOR_READ) * but gcc's optimiser doesn't handle that as well and emits * two jumps instead of one. Maybe one day... * * FIXME: There is also sync flushing in set_pages(), which * serves a different purpose (some of the time at least). * * We should consider: * * 1. Rip out the async flush code. * * 2. Or make the sync flushing use the async clflush path * using mandatory fences underneath. Currently the below * async flush happens after we bind the object.
*/ if (unlikely(obj->cache_dirty & ~obj->cache_coherent)) { if (i915_gem_clflush_object(obj, 0))
flags &= ~EXEC_OBJECT_ASYNC;
}
/* We only need to await on the first request */ if (err == 0 && !(flags & EXEC_OBJECT_ASYNC)) {
err = i915_request_await_object
(eb_find_first_request_added(eb), obj,
flags & EXEC_OBJECT_WRITE);
}
for_each_batch_add_order(eb, j) { if (err) break; if (!eb->requests[j]) continue;
staticstruct i915_vma *eb_dispatch_secure(struct i915_execbuffer *eb, struct i915_vma *vma)
{ /* * snb/ivb/vlv conflate the "batch in ppgtt" bit with the "non-secure * batch" bit. Hence we need to pin secure batches into the global gtt.
* hsw should have this fixed, but bdw mucks it up again. */ if (eb->batch_flags & I915_DISPATCH_SECURE) return i915_gem_object_ggtt_pin_ww(vma->obj, &eb->ww, NULL, 0, 0, PIN_VALIDATE);
if (intel_context_nopreempt(rq->context))
__set_bit(I915_FENCE_FLAG_NOPREEMPT, &rq->fence.flags);
if (eb->args->flags & I915_EXEC_GEN7_SOL_RESET) {
err = i915_reset_gen7_sol_offsets(rq); if (err) return err;
}
/* * After we completed waiting for other engines (using HW semaphores) * then we can signal that this request/batch is ready to run. This * allows us to determine if the batch is still waiting on the GPU * or actually running by checking the breadcrumb.
*/ if (rq->context->engine->emit_init_breadcrumb) {
err = rq->context->engine->emit_init_breadcrumb(rq); if (err) return err;
}
/* * Find one BSD ring to dispatch the corresponding BSD command. * The engine index is returned.
*/ staticunsignedint
gen8_dispatch_bsd_engine(struct drm_i915_private *i915, struct drm_file *file)
{ struct drm_i915_file_private *file_priv = file->driver_priv;
/* Check whether the file_priv has already selected one ring. */ if ((int)file_priv->bsd_engine < 0)
file_priv->bsd_engine =
get_random_u32_below(i915->engine_uabi_class_count[I915_ENGINE_CLASS_VIDEO]);
/* * Completely unscientific finger-in-the-air estimates for suitable * maximum user request size (to avoid blocking) and then backoff.
*/ if (intel_ring_update_space(ring) >= PAGE_SIZE) return NULL;
/* * Find a request that after waiting upon, there will be at least half * the ring available. The hysteresis allows us to compete for the * shared ring and should mean that we sleep less often prior to * claiming our resources, but not so long that the ring completely * drains before we can submit our next request.
*/
list_for_each_entry(rq, &tl->requests, link) { if (rq->ring != ring) continue;
if (__intel_ring_space(rq->postfix,
ring->emit, ring->size) > ring->size / 2) break;
} if (&rq->link == &tl->requests) return NULL; /* weird, we will check again later for real */
/* * Take a local wakeref for preparing to dispatch the execbuf as * we expect to access the hardware fairly frequently in the * process, and require the engine to be kept awake between accesses. * Upon dispatch, we acquire another prolonged wakeref that we hold * until the timeline is idle, which in turn releases the wakeref * taken on the engine, and the parent device.
*/
tl = intel_context_timeline_lock(ce); if (IS_ERR(tl)) return PTR_ERR(tl);
intel_context_enter(ce); if (throttle)
rq = eb_throttle(eb, ce);
intel_context_timeline_unlock(tl);
if (rq) { bool nonblock = eb->file->filp->f_flags & O_NONBLOCK; long timeout = nonblock ? 0 : MAX_SCHEDULE_TIMEOUT;
if (i915_request_wait(rq, I915_WAIT_INTERRUPTIBLE,
timeout) < 0) {
i915_request_put(rq);
/* * Error path, cannot use intel_context_timeline_lock as * that is user interruptible and this clean up step * must be done.
*/
mutex_lock(&ce->timeline->mutex);
intel_context_exit(ce);
mutex_unlock(&ce->timeline->mutex);
if (nonblock) return -EWOULDBLOCK; else return -EINTR;
}
i915_request_put(rq);
}
return 0;
}
staticint eb_pin_engine(struct i915_execbuffer *eb, bool throttle)
{ struct intel_context *ce = eb->context, *child; int err; int i = 0, j = 0;
if (unlikely(intel_context_is_banned(ce))) return -EIO;
/* * Pinning the contexts may generate requests in order to acquire * GGTT space, so do this first before we reserve a seqno for * ourselves.
*/
err = intel_context_pin_ww(ce, &eb->ww); if (err) return err;
for_each_child(ce, child) {
err = intel_context_pin_ww(child, &eb->ww);
GEM_BUG_ON(err); /* perma-pinned should incr a counter */
}
if (user_ring_id != I915_EXEC_BSD &&
(args->flags & I915_EXEC_BSD_MASK)) {
drm_dbg(&i915->drm, "execbuf with non bsd ring but with invalid " "bsd dispatch flags: %d\n", (int)(args->flags)); return -1;
}
for_each_child(ce, child)
intel_context_get(child);
eb->wakeref = intel_gt_pm_get(ce->engine->gt); /* * Keep GT0 active on MTL so that i915_vma_parked() doesn't * free VMAs while execbuf ioctl is validating VMAs.
*/ if (gt->info.id)
eb->wakeref_gt0 = intel_gt_pm_get(to_gt(gt->i915));
if (!test_bit(CONTEXT_ALLOC_BIT, &ce->flags)) {
err = intel_context_alloc_state(ce); if (err) goto err;
}
for_each_child(ce, child) { if (!test_bit(CONTEXT_ALLOC_BIT, &child->flags)) {
err = intel_context_alloc_state(child); if (err) goto err;
}
}
/* * ABI: Before userspace accesses the GPU (e.g. execbuffer), report * EIO if the GPU is already wedged.
*/
err = intel_gt_terminally_wedged(ce->engine->gt); if (err) goto err;
if (!i915_vm_tryget(ce->vm)) {
err = -ENOENT; goto err;
}
eb->context = ce;
eb->gt = ce->engine->gt;
/* * Make sure engine pool stays alive even if we call intel_context_put * during ww handling. The pool is destroyed when last pm reference * is dropped, which breaks our -EDEADLK handling.
*/ return err;
err: if (gt->info.id)
intel_gt_pm_put(to_gt(gt->i915), eb->wakeref_gt0);
i915_vm_put(eb->context->vm); /* * This works in conjunction with eb_select_engine() to prevent * i915_vma_parked() from interfering while execbuf validates vmas.
*/ if (eb->gt->info.id)
intel_gt_pm_put(to_gt(eb->gt->i915), eb->wakeref_gt0);
intel_gt_pm_put(eb->context->engine->gt, eb->wakeref);
for_each_child(eb->context, child)
intel_context_put(child);
intel_context_put(eb->context);
}
if (!fence && user_fence.flags &&
!(user_fence.flags & I915_EXEC_FENCE_SIGNAL)) {
drm_dbg(&eb->i915->drm, "Syncobj handle has no fence\n");
drm_syncobj_put(syncobj); return -EINVAL;
}
if (fence)
err = dma_fence_chain_find_seqno(&fence, point);
if (err && !(user_fence.flags & I915_EXEC_FENCE_SIGNAL)) {
drm_dbg(&eb->i915->drm, "Syncobj handle missing requested point %llu\n",
point);
dma_fence_put(fence);
drm_syncobj_put(syncobj); return err;
}
/* * A point might have been signaled already and * garbage collected from the timeline. In this case * just ignore the point and carry on.
*/ if (!fence && !(user_fence.flags & I915_EXEC_FENCE_SIGNAL)) {
drm_syncobj_put(syncobj); continue;
}
/* * For timeline syncobjs we need to preallocate chains for * later signaling.
*/ if (point != 0 && user_fence.flags & I915_EXEC_FENCE_SIGNAL) { /* * Waiting and signaling the same point (when point != * 0) would break the timeline.
*/ if (user_fence.flags & I915_EXEC_FENCE_WAIT) {
drm_dbg(&eb->i915->drm, "Trying to wait & signal the same timeline point.\n");
dma_fence_put(fence);
drm_syncobj_put(syncobj); return -EINVAL;
}
/* Check that the context wasn't destroyed before submission */ if (likely(!intel_context_is_closed(eb->context))) {
attr = eb->gem_context->sched;
} else { /* Serialise with context_close via the add_to_timeline */
i915_request_set_error_once(rq, -ENOENT);
__i915_request_skip(rq);
err = -ENOENT; /* override any transient errors */
}
if (intel_context_is_parallel(eb->context)) { if (err) {
__i915_request_skip(rq);
set_bit(I915_FENCE_FLAG_SKIP_PARALLEL,
&rq->fence.flags);
} if (last_parallel)
set_bit(I915_FENCE_FLAG_SUBMIT_PARALLEL,
&rq->fence.flags);
}
__i915_request_queue(rq, &attr);
/* Try to clean up the client's timeline after submitting the request */ if (prev)
retire_requests(tl, prev);
mutex_unlock(&tl->mutex);
return err;
}
staticint eb_requests_add(struct i915_execbuffer *eb, int err)
{ int i;
/* * We iterate in reverse order of creation to release timeline mutexes in * same order.
*/
for_each_batch_add_order(eb, i) { struct i915_request *rq = eb->requests[i];
if (!rq) continue;
err |= eb_request_add(eb, rq, err, i == 0);
}
/* The execbuf2 extension mechanism reuses cliprects_ptr. So we cannot * have another flag also using it at the same time.
*/ if (eb->args->flags & I915_EXEC_FENCE_ARRAY) return -EINVAL;
for_each_batch_create_order(eb, i) { /* Allocate a request for this batch buffer nice and early. */
eb->requests[i] = i915_request_create(eb_find_context(eb, i)); if (IS_ERR(eb->requests[i])) {
out_fence = ERR_CAST(eb->requests[i]);
eb->requests[i] = NULL; return out_fence;
}
/* * Only the first request added (committed to backend) has to * take the in fences into account as all subsequent requests * will have fences inserted in between them.
*/ if (i + 1 == eb->num_batches) {
out_fence = eb_fences_add(eb, eb->requests[i],
in_fence, out_fence_fd); if (IS_ERR(out_fence)) return out_fence;
}
/* * Not really on stack, but we don't want to call * kfree on the batch_snapshot when we put it, so use the * _onstack interface.
*/ if (eb->batches[i]->vma)
eb->requests[i]->batch_res =
i915_vma_resource_get(eb->batches[i]->vma->resource); if (eb->batch_pool) {
GEM_BUG_ON(intel_context_is_parallel(eb->context));
intel_gt_buffer_pool_mark_active(eb->batch_pool,
eb->requests[i]);
}
}
err = eb_relocate_parse(&eb); if (err) { /* * If the user expects the execobject.offset and * reloc.presumed_offset to be an exact match, * as for using NO_RELOC, then we cannot update * the execobject.offset until we have completed * relocation.
*/
args->flags &= ~__EXEC_HAS_RELOC; goto err_vma;
}
ww_acquire_done(&eb.ww.ctx);
err = eb_capture_stage(&eb); if (err) goto err_vma;
/* * When using LUT_HANDLE, we impose a limit of INT_MAX for the lookup * array size (see eb_create()). Otherwise, we can accept an array as * large as can be addressed (though use large arrays at your peril)!
*/
/* * Now that we have begun execution of the batchbuffer, we ignore * any new error after this point. Also given that we have already * updated the associated relocations, we try to write out the current * object locations irrespective of any error.
*/ if (args->flags & __EXEC_HAS_RELOC) { struct drm_i915_gem_exec_object2 __user *user_exec_list =
u64_to_user_ptr(args->buffers_ptr); unsignedint i;
/* Copy the new buffer offsets back to the user's exec list. */ /* * Note: count * sizeof(*user_exec_list) does not overflow, * because we checked 'count' in check_buffer_count(). * * And this range already got effectively checked earlier * when we did the "copy_from_user()" above.
*/ if (!user_write_access_begin(user_exec_list,
count * sizeof(*user_exec_list))) goto end;
for (i = 0; i < args->buffer_count; i++) { if (!(exec2_list[i].offset & UPDATE)) continue;
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.