/** This vma's place in the execbuf reservation list */ struct drm_i915_gem_exec_object2 *exec; struct list_head bind_link; struct list_head reloc_link;
struct hlist_node node;
u32 handle;
};
/*
 * Debug override for the relocation method: use_cpu_reloc() compares
 * DBG_FORCE_RELOC against these values to force a particular path.
 * Leave DBG_FORCE_RELOC at 0 for the normal heuristic choice.
 */
enum {
	FORCE_CPU_RELOC = 1,
	FORCE_GTT_RELOC,
	FORCE_GPU_RELOC,
#define DBG_FORCE_RELOC 0 /* choose one of the above! */
};
/* __EXEC_OBJECT_ flags > BIT(29) defined in i915_vma.h */
#define __EXEC_OBJECT_HAS_PIN		BIT(29)
#define __EXEC_OBJECT_HAS_FENCE		BIT(28)
#define __EXEC_OBJECT_USERPTR_INIT	BIT(27)
#define __EXEC_OBJECT_NEEDS_MAP		BIT(26)
#define __EXEC_OBJECT_NEEDS_BIAS	BIT(25)
#define __EXEC_OBJECT_INTERNAL_FLAGS	(~0u << 25) /* all of the above + */
/* State cleared by eb_unreserve_vma() once the reservation is dropped */
#define __EXEC_OBJECT_RESERVED (__EXEC_OBJECT_HAS_PIN | __EXEC_OBJECT_HAS_FENCE)
/* Catch emission of unexpected errors for CI! */
#if IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)
#undef EINVAL
/*
 * Redefine EINVAL so every "return -EINVAL" in this file logs the
 * function and line that produced it; 22 keeps the numeric value of
 * EINVAL so callers/userspace observe no change.
 */
#define EINVAL ({ \
	DRM_DEBUG_DRIVER("EINVAL at %s:%d\n", __func__, __LINE__); \
	22; \
})
#endif
/** * DOC: User command execution * * Userspace submits commands to be executed on the GPU as an instruction * stream within a GEM object we call a batchbuffer. This instructions may * refer to other GEM objects containing auxiliary state such as kernels, * samplers, render targets and even secondary batchbuffers. Userspace does * not know where in the GPU memory these objects reside and so before the * batchbuffer is passed to the GPU for execution, those addresses in the * batchbuffer and auxiliary objects are updated. This is known as relocation, * or patching. To try and avoid having to relocate each object on the next * execution, userspace is told the location of those objects in this pass, * but this remains just a hint as the kernel may choose a new location for * any object in the future. * * At the level of talking to the hardware, submitting a batchbuffer for the * GPU to execute is to add content to a buffer from which the HW * command streamer is reading. * * 1. Add a command to load the HW context. For Logical Ring Contexts, i.e. * Execlists, this command is not placed on the same buffer as the * remaining items. * * 2. Add a command to invalidate caches to the buffer. * * 3. Add a batchbuffer start command to the buffer; the start command is * essentially a token together with the GPU address of the batchbuffer * to be executed. * * 4. Add a pipeline flush to the buffer. * * 5. Add a memory write command to the buffer to record when the GPU * is done executing the batchbuffer. The memory write writes the * global sequence number of the request, ``i915_request::global_seqno``; * the i915 driver uses the current value in the register to determine * if the GPU has completed the batchbuffer. * * 6. Add a user interrupt command to the buffer. This command instructs * the GPU to issue an interrupt when the command, pipeline flush and * memory write are completed. * * 7. 
Inform the hardware of the additional commands added to the buffer * (by updating the tail pointer). * * Processing an execbuf ioctl is conceptually split up into a few phases. * * 1. Validation - Ensure all the pointers, handles and flags are valid. * 2. Reservation - Assign GPU address space for every object * 3. Relocation - Update any addresses to point to the final locations * 4. Serialisation - Order the request with respect to its dependencies * 5. Construction - Construct a request to execute the batchbuffer * 6. Submission (at some point in the future execution) * * Reserving resources for the execbuf is the most complicated phase. We * neither want to have to migrate the object in the address space, nor do * we want to have to update any relocations pointing to this object. Ideally, * we want to leave the object where it is and for all the existing relocations * to match. If the object is given a new address, or if userspace thinks the * object is elsewhere, we have to parse all the relocation entries and update * the addresses. Userspace can set the I915_EXEC_NORELOC flag to hint that * all the target addresses in all of its objects match the value in the * relocation entries and that they all match the presumed offsets given by the * list of execbuffer objects. Using this knowledge, we know that if we haven't * moved any buffers, all the relocation entries are valid and we can skip * the update. (If userspace is wrong, the likely outcome is an impromptu GPU * hang.) The requirement for using I915_EXEC_NO_RELOC are: * * The addresses written in the objects must match the corresponding * reloc.presumed_offset which in turn must match the corresponding * execobject.offset. * * Any render targets written to in the batch must be flagged with * EXEC_OBJECT_WRITE. * * To avoid stalling, execobject.offset should match the current * address of that object within the active context. * * The reservation is done is multiple phases. 
First we try and keep any * object already bound in its current location - so as long as meets the * constraints imposed by the new execbuffer. Any object left unbound after the * first pass is then fitted into any available idle space. If an object does * not fit, all objects are removed from the reservation and the process rerun * after sorting the objects into a priority order (more difficult to fit * objects are tried first). Failing that, the entire VM is cleared and we try * to fit the execbuf once last time before concluding that it simply will not * fit. * * A small complication to all of this is that we allow userspace not only to * specify an alignment and a size for the object in the address space, but * we also allow userspace to specify the exact offset. This objects are * simpler to place (the location is known a priori) all we have to do is make * sure the space is available. * * Once all the objects are in place, patching up the buried pointers to point * to the final locations is a fairly simple job of walking over the relocation * entry arrays, looking up the right address and rewriting the value into * the object. Simple! ... The relocation entries are stored in user memory * and so to access them we have to copy them into a local buffer. That copy * has to avoid taking any pagefaults as they may lead back to a GEM object * requiring the struct_mutex (i.e. recursive deadlock). So once again we split * the relocation into multiple passes. First we try to do everything within an * atomic context (avoid the pagefaults) which requires that we never wait. If * we detect that we may wait, or if we need to fault, then we have to fallback * to a slower path. The slowpath has to drop the mutex. (Can you hear alarm * bells yet?) Dropping the mutex means that we lose all the state we have * built up so far for the execbuf and we must reset any global data. 
However, * we do leave the objects pinned in their final locations - which is a * potential issue for concurrent execbufs. Once we have left the mutex, we can * allocate and copy all the relocation entries into a large array at our * leisure, reacquire the mutex, reclaim all the objects and other state and * then proceed to update any incorrect addresses with the objects. * * As we process the relocation entries, we maintain a record of whether the * object is being written to. Using NORELOC, we expect userspace to provide * this information instead. We also check whether we can skip the relocation * by comparing the expected value inside the relocation entry with the target's * final address. If they differ, we have to map the current object and rewrite * the 4 or 8 byte pointer within. * * Serialising an execbuf is quite simple according to the rules of the GEM * ABI. Execution within each context is ordered by the order of submission. * Writes to any GEM object are in order of submission and are exclusive. Reads * from a GEM object are unordered with respect to other reads, but ordered by * writes. A write submitted after a read cannot occur before the read, and * similarly any read submitted after a write cannot occur before the write. * Writes are ordered between engines such that only one write occurs at any * time (completing any reads beforehand) - using semaphores where available * and CPU serialisation otherwise. Other GEM access obey the same rules, any * write (either via mmaps using set-domain, or via pwrite) must flush all GPU * reads before starting, and any read (either using set-domain or pread) must * flush all GPU writes before starting. (Note we only employ a barrier before, * we currently rely on userspace not concurrently starting a new execution * whilst reading or writing to an object. This may be an advantage or not * depending on how much you trust userspace not to shoot themselves in the * foot.) 
Serialisation may just result in the request being inserted into * a DAG awaiting its turn, but most simple is to wait on the CPU until * all dependencies are resolved. * * After all of that, is just a matter of closing the request and handing it to * the hardware (well, leaving it in a queue to be executed). However, we also * offer the ability for batchbuffers to be run with elevated privileges so * that they access otherwise hidden registers. (Used to adjust L3 cache etc.) * Before any batch is given extra privileges we first must check that it * contains no nefarious instructions, we check that each instruction is from * our whitelist and all registers are also from an allowed list. We first * copy the user's batchbuffer to a shadow (so that the user doesn't have * access to it, either by the CPU or GPU as we scan it) and then parse each * instruction. If everything is ok, we set a flag telling the hardware to run * the batchbuffer in trusted mode, otherwise the ioctl is rejected.
*/
/*
 * Book-keeping for one user-supplied syncobj fence attached to the
 * execbuf. NOTE(review): field semantics inferred from names/types —
 * confirm against the fence setup/signalling code outside this chunk.
 */
struct eb_fence {
	struct drm_syncobj *syncobj; /* Use with ptr_mask_bits() */
	struct dma_fence *dma_fence;		/* fence resolved from the syncobj, if any */
	u64 value;				/* timeline point (0 for binary syncobjs?) — verify */
	struct dma_fence_chain *chain_fence;	/* preallocated chain node for timeline signalling — verify */
};
struct intel_gt *gt; /* gt for the execbuf */ struct intel_context *context; /* logical state for the request */ struct i915_gem_context *gem_context; /** caller's context */
intel_wakeref_t wakeref;
intel_wakeref_t wakeref_gt0;
/** our requests to build */ struct i915_request *requests[MAX_ENGINE_INSTANCE + 1]; /** identity of the batch obj/vma */ struct eb_vma *batches[MAX_ENGINE_INSTANCE + 1]; struct i915_vma *trampoline; /** trampoline used for chaining */
/** used for excl fence in dma_resv objects when > 1 BB submitted */ struct dma_fence *composite_fence;
/** actual size of execobj[] as we may extend it for the cmdparser */ unsignedint buffer_count;
/* number of batches in execbuf IOCTL */ unsignedint num_batches;
/** list of vma not yet bound during reservation phase */ struct list_head unbound;
/** list of vma that have execobj.relocation_count */ struct list_head relocs;
struct i915_gem_ww_ctx ww;
/** * Track the most recently used object for relocations, as we * frequently have to perform multiple relocations within the same * obj/page
*/ struct reloc_cache { struct drm_mm_node node; /** temporary GTT binding */ unsignedlong vaddr; /** Current kmap address */ unsignedlong page; /** Currently mapped page index */ unsignedint graphics_ver; /** Cached value of GRAPHICS_VER */ bool use_64bit_reloc : 1; bool has_llc : 1; bool has_fence : 1; bool needs_unfenced : 1;
} reloc_cache;
u64 invalid_flags; /** Set of execobj.flags that are invalid */
/** Length of batch within object */
u64 batch_len[MAX_ENGINE_INSTANCE + 1];
u32 batch_start_offset; /** Location within object of batch */
u32 batch_flags; /** Flags composed for emit_bb_start() */ struct intel_gt_buffer_pool_node *batch_pool; /** pool node for batch buffer */
/** * Indicate either the size of the hashtable used to resolve * relocation handles, or if negative that we are using a direct * index into the execobj[].
*/ int lut_size; struct hlist_head *buckets; /** ht for relocation handles */
/* * Without a 1:1 association between relocation handles and * the execobject[] index, we instead create a hashtable. * We size it dynamically based on available memory, starting * first with 1:1 associative hash and scaling back until * the allocation succeeds. * * Later on we use a positive lut_size to indicate we are * using this hashtable, and a negative value to indicate a * direct lookup.
*/ do {
gfp_t flags;
/* While we can still reduce the allocation size, don't * raise a warning and allow the allocation to fail. * On the last pass though, we want to try as hard * as possible to perform the allocation and warn * if it fails.
*/
flags = GFP_KERNEL; if (size > 1)
flags |= __GFP_NORETRY | __GFP_NOWARN;
eb->buckets = kzalloc(sizeof(struct hlist_head) << size,
flags); if (eb->buckets) break;
} while (--size);
if (exec_flags & EXEC_OBJECT_NEEDS_GTT)
pin_flags |= PIN_GLOBAL;
/* * Wa32bitGeneralStateOffset & Wa32bitInstructionBaseOffset, * limit address to the first 4GBs for unflagged objects.
*/ if (!(exec_flags & EXEC_OBJECT_SUPPORTS_48B_ADDRESS))
pin_flags |= PIN_ZONE_4G;
if (exec_flags & __EXEC_OBJECT_NEEDS_MAP)
pin_flags |= PIN_MAPPABLE;
/* Attempt to reuse the current location if available */
err = i915_vma_pin_ww(vma, &eb->ww, 0, 0, pin_flags); if (err == -EDEADLK) return err;
if (unlikely(err)) { if (entry->flags & EXEC_OBJECT_PINNED) return err;
/* Failing that pick any _free_ space if suitable */
err = i915_vma_pin_ww(vma, &eb->ww,
entry->pad_to_size,
entry->alignment,
eb_pin_flags(entry, ev->flags) |
PIN_USER | PIN_NOEVICT | PIN_VALIDATE); if (unlikely(err)) return err;
}
if (unlikely(ev->flags & EXEC_OBJECT_NEEDS_FENCE)) {
err = i915_vma_pin_fence(vma); if (unlikely(err)) return err;
if (vma->fence)
ev->flags |= __EXEC_OBJECT_HAS_FENCE;
}
ev->flags |= __EXEC_OBJECT_HAS_PIN; if (eb_vma_misplaced(entry, vma, ev->flags)) return -EBADSLT;
return 0;
}
/*
 * Drop the reservation state recorded on @ev: release the GGTT fence if
 * one was taken (__EXEC_OBJECT_HAS_FENCE) and clear the reservation
 * flags (__EXEC_OBJECT_HAS_PIN | __EXEC_OBJECT_HAS_FENCE). Note no vma
 * unpin is performed here — only the book-keeping bits are cleared.
 */
static void
eb_unreserve_vma(struct eb_vma *ev)
{
	if (unlikely(ev->flags & __EXEC_OBJECT_HAS_FENCE))
		__i915_vma_unpin_fence(ev->vma);

	ev->flags &= ~__EXEC_OBJECT_RESERVED;
}
staticint
eb_validate_vma(struct i915_execbuffer *eb, struct drm_i915_gem_exec_object2 *entry, struct i915_vma *vma)
{ /* Relocations are disallowed for all platforms after TGL-LP. This * also covers all platforms with local memory.
*/ if (entry->relocation_count &&
GRAPHICS_VER(eb->i915) >= 12 && !IS_TIGERLAKE(eb->i915)) return -EINVAL;
if (unlikely(entry->flags & eb->invalid_flags)) return -EINVAL;
if (unlikely(entry->alignment &&
!is_power_of_2_u64(entry->alignment))) return -EINVAL;
/* * Offset can be used as input (EXEC_OBJECT_PINNED), reject * any non-page-aligned or non-canonical addresses.
*/ if (unlikely(entry->flags & EXEC_OBJECT_PINNED &&
entry->offset != gen8_canonical_addr(entry->offset & I915_GTT_PAGE_MASK))) return -EINVAL;
/* pad_to_size was once a reserved field, so sanitize it */ if (entry->flags & EXEC_OBJECT_PAD_TO_SIZE) { if (unlikely(offset_in_page(entry->pad_to_size))) return -EINVAL;
} else {
entry->pad_to_size = 0;
} /* * From drm_mm perspective address space is continuous, * so from this point we're always using non-canonical * form internally.
*/
entry->offset = gen8_noncanonical_addr(entry->offset);
if (entry->relocation_count)
list_add_tail(&ev->reloc_link, &eb->relocs);
/* * SNA is doing fancy tricks with compressing batch buffers, which leads * to negative relocation deltas. Usually that works out ok since the * relocate address is still positive, except when the batch is placed * very low in the GTT. Ensure this doesn't happen. * * Note that actual hangs have only been observed on gen7, but for * paranoia do it everywhere.
*/ if (is_batch_buffer(eb, i)) { if (entry->relocation_count &&
!(ev->flags & EXEC_OBJECT_PINNED))
ev->flags |= __EXEC_OBJECT_NEEDS_BIAS; if (eb->reloc_cache.has_fence)
ev->flags |= EXEC_OBJECT_NEEDS_FENCE;
eb->batches[*current_batch] = ev;
if (unlikely(ev->flags & EXEC_OBJECT_WRITE)) {
drm_dbg(&i915->drm, "Attempting to use self-modifying batch buffer\n"); return -EINVAL;
}
if (range_overflows_t(u64,
eb->batch_start_offset,
eb->args->batch_len,
ev->vma->size)) {
drm_dbg(&i915->drm, "Attempting to use out-of-bounds batch\n"); return -EINVAL;
}
if (DBG_FORCE_RELOC == FORCE_CPU_RELOC) returntrue;
if (DBG_FORCE_RELOC == FORCE_GTT_RELOC) returnfalse;
/* * For objects created by userspace through GEM_CREATE with pat_index * set by set_pat extension, i915_gem_object_has_cache_level() always * return true, otherwise the call would fall back to checking whether * the object is un-cached.
*/ return (cache->has_llc ||
obj->cache_dirty ||
!i915_gem_object_has_cache_level(obj, I915_CACHE_NONE));
}
/* * We have one more buffers that we couldn't bind, which could be due to * various reasons. To resolve this we have 4 passes, with every next * level turning the screws tighter: * * 0. Unbind all objects that do not match the GTT constraints for the * execbuffer (fenceable, mappable, alignment etc). Bind all new * objects. This avoids unnecessary unbinding of later objects in order * to make room for the earlier objects *unless* we need to defragment. * * 1. Reorder the buffers, where objects with the most restrictive * placement requirements go first (ignoring fixed location buffers for * now). For example, objects needing the mappable aperture (the first * 256M of GTT), should go first vs objects that can be placed just * about anywhere. Repeat the previous pass. * * 2. Consider buffers that are pinned at a fixed location. Also try to * evict the entire VM this time, leaving only objects that we were * unable to lock. Try again to bind the buffers. (still using the new * buffer order). * * 3. We likely have object lock contention for one or more stubborn * objects in the VM, for which we need to evict to make forward * progress (perhaps we are fighting the shrinker?). When evicting the * VM this time around, anything that we can't lock we now track using * the busy_bo, using the full lock (after dropping the vm->mutex to * prevent deadlocks), instead of trylock. We then continue to evict the * VM, this time with the stubborn object locked, which we can now * hopefully unbind (if still bound in the VM). Repeat until the VM is * evicted. Finally we should be able bind everything.
*/ for (pass = 0; pass <= 3; pass++) { int pin_flags = PIN_USER | PIN_VALIDATE;
if (pass == 0)
pin_flags |= PIN_NONBLOCK;
if (pass >= 1)
eb_unbind(eb, pass >= 2);
if (pass == 2) {
err = mutex_lock_interruptible(&eb->context->vm->mutex); if (!err) {
err = i915_gem_evict_vm(eb->context->vm, &eb->ww, NULL);
mutex_unlock(&eb->context->vm->mutex);
} if (err) return err;
}
if (pass == 3) {
retry:
err = mutex_lock_interruptible(&eb->context->vm->mutex); if (!err) { struct drm_i915_gem_object *busy_bo = NULL;
err = i915_gem_evict_vm(eb->context->vm, &eb->ww, &busy_bo);
mutex_unlock(&eb->context->vm->mutex); if (err && busy_bo) {
err = i915_gem_object_lock(busy_bo, &eb->ww);
i915_gem_object_put(busy_bo); if (!err) goto retry;
}
} if (err) return err;
}
lut = i915_lut_handle_alloc(); if (unlikely(!lut)) return -ENOMEM;
i915_vma_get(vma); if (!atomic_fetch_inc(&vma->open_count))
i915_vma_reopen(vma);
lut->handle = handle;
lut->ctx = ctx;
/* Check that the context hasn't been closed in the meantime */
err = -EINTR; if (!mutex_lock_interruptible(&ctx->lut_mutex)) { if (likely(!i915_gem_context_is_closed(ctx)))
err = radix_tree_insert(&ctx->handles_vma, handle, vma); else
err = -ENOENT; if (err == 0) { /* And nor has this handle */ struct drm_i915_gem_object *obj = vma->obj;
do { struct drm_i915_gem_object *obj; struct i915_vma *vma; int err;
rcu_read_lock();
vma = radix_tree_lookup(&eb->gem_context->handles_vma, handle); if (likely(vma && vma->vm == vm))
vma = i915_vma_tryget(vma);
rcu_read_unlock(); if (likely(vma)) return vma;
obj = i915_gem_object_lookup(eb->file, handle); if (unlikely(!obj)) return ERR_PTR(-ENOENT);
/* * If the user has opted-in for protected-object tracking, make * sure the object encryption can be used. * We only need to do this when the object is first used with * this context, because the context itself will be banned when * the protected objects become invalid.
*/ if (i915_gem_context_uses_protected_content(eb->gem_context) &&
i915_gem_object_is_protected(obj)) {
err = intel_pxp_key_check(intel_bo_to_drm_bo(obj), true); if (err) {
i915_gem_object_put(obj); return ERR_PTR(err);
}
}
err = eb_add_vma(eb, ¤t_batch, i, vma); if (err) return err;
if (i915_gem_object_is_userptr(vma->obj)) {
err = i915_gem_object_userptr_submit_init(vma->obj); if (err) { if (i + 1 < eb->buffer_count) { /* * Execbuffer code expects last vma entry to be NULL, * since we already initialized this entry, * set the next value to NULL or we mess up * cleanup handling.
*/
eb->vma[i + 1].vma = NULL;
}
if (cache->vaddr) {
intel_gt_flush_ggtt_writes(ggtt->vm.gt);
io_mapping_unmap_atomic((void __force __iomem *) unmask_page(cache->vaddr));
} else { struct i915_vma *vma = ERR_PTR(-ENODEV); int err;
if (i915_gem_object_is_tiled(obj)) return ERR_PTR(-EINVAL);
if (use_cpu_reloc(cache, obj)) return NULL;
err = i915_gem_object_set_to_gtt_domain(obj, true); if (err) return ERR_PTR(err);
/* * i915_gem_object_ggtt_pin_ww may attempt to remove the batch * VMA from the object list because we no longer pin. * * Only attempt to pin the batch buffer to ggtt if the current batch * is not inside ggtt, or the batch buffer is not misplaced.
*/ if (!i915_is_ggtt(batch->vm) ||
!i915_vma_misplaced(batch, 0, 0, PIN_MAPPABLE)) {
vma = i915_gem_object_ggtt_pin_ww(obj, &eb->ww, NULL, 0, 0,
PIN_MAPPABLE |
PIN_NONBLOCK /* NOWARN */ |
PIN_NOEVICT);
}
if (vma == ERR_PTR(-EDEADLK)) return vma;
if (IS_ERR(vma)) {
memset(&cache->node, 0, sizeof(cache->node));
mutex_lock(&ggtt->vm.mutex);
err = drm_mm_insert_node_in_range
(&ggtt->vm.mm, &cache->node,
PAGE_SIZE, 0, I915_COLOR_UNEVICTABLE,
0, ggtt->mappable_end,
DRM_MM_INSERT_LOW);
mutex_unlock(&ggtt->vm.mutex); if (err) /* no inactive aperture space, use cpu reloc */ return NULL;
} else {
cache->node.start = i915_ggtt_offset(vma);
cache->node.mm = (void *)vma;
}
}
/* * Writes to the same cacheline are serialised by the CPU * (including clflush). On the write path, we only require * that it hits memory in an orderly fashion and place * mb barriers at the start and end of the relocation phase * to ensure ordering of clflush wrt to the system.
*/ if (flushes & CLFLUSH_AFTER)
drm_clflush_virt_range(addr, sizeof(*addr));
} else
*addr = value;
}
/* we already hold a reference to all valid objects */
target = eb_get_vma(eb, reloc->target_handle); if (unlikely(!target)) return -ENOENT;
/* Validate that the target is in a valid r/w GPU domain */ if (unlikely(reloc->write_domain & (reloc->write_domain - 1))) {
drm_dbg(&i915->drm, "reloc with multiple write domains: " "target %d offset %d " "read %08x write %08x\n",
reloc->target_handle,
(int) reloc->offset,
reloc->read_domains,
reloc->write_domain); return -EINVAL;
} if (unlikely((reloc->write_domain | reloc->read_domains)
& ~I915_GEM_GPU_DOMAINS)) {
drm_dbg(&i915->drm, "reloc with read/write non-GPU domains: " "target %d offset %d " "read %08x write %08x\n",
reloc->target_handle,
(int) reloc->offset,
reloc->read_domains,
reloc->write_domain); return -EINVAL;
}
if (reloc->write_domain) {
target->flags |= EXEC_OBJECT_WRITE;
/* * Sandybridge PPGTT errata: We need a global gtt mapping * for MI and pipe_control writes because the gpu doesn't * properly redirect them through the ppgtt for non_secure * batchbuffers.
*/ if (reloc->write_domain == I915_GEM_DOMAIN_INSTRUCTION &&
GRAPHICS_VER(eb->i915) == 6 &&
!i915_vma_is_bound(target->vma, I915_VMA_GLOBAL_BIND)) { struct i915_vma *vma = target->vma;
/* * If the relocation already has the right value in it, no * more work needs to be done.
*/ if (!DBG_FORCE_RELOC &&
gen8_canonical_addr(i915_vma_offset(target->vma)) == reloc->presumed_offset) return 0;
/* Check that the relocation address is valid... */ if (unlikely(reloc->offset >
ev->vma->size - (eb->reloc_cache.use_64bit_reloc ? 8 : 4))) {
drm_dbg(&i915->drm, "Relocation beyond object bounds: " "target %d offset %d size %d.\n",
reloc->target_handle,
(int)reloc->offset,
(int)ev->vma->size); return -EINVAL;
} if (unlikely(reloc->offset & 3)) {
drm_dbg(&i915->drm, "Relocation not 4-byte aligned: " "target %d offset %d.\n",
reloc->target_handle,
(int)reloc->offset); return -EINVAL;
}
/* * If we write into the object, we need to force the synchronisation * barrier, either with an asynchronous clflush or if we executed the * patching using the GPU (though that should be serialised by the * timeline). To be completely sure, and since we are required to * do relocations we are already stalling, disable the user's opt * out of our synchronisation.
*/
ev->flags &= ~EXEC_OBJECT_ASYNC;
/* and update the user's relocation entry */ return relocate_entry(ev->vma, reloc, eb, target->vma);
}
if (unlikely(remain > N_RELOC(INT_MAX))) return -EINVAL;
/* * We must check that the entire relocation array is safe * to read. However, if the array is not writable the user loses * the updated relocation values.
*/ if (unlikely(!access_ok(urelocs, remain * sizeof(*urelocs)))) return -EFAULT;
/* * This is the fast path and we cannot handle a pagefault * whilst holding the struct mutex lest the user pass in the * relocations contained within a mmaped bo. For in such a case * we, the page fault handler would call i915_gem_fault() and * we would try to acquire the struct mutex again. Obviously * this is bad and so lockdep complains vehemently.
*/
pagefault_disable();
copied = __copy_from_user_inatomic(r, urelocs, count * sizeof(r[0]));
pagefault_enable(); if (unlikely(copied)) {
remain = -EFAULT; goto out;
}
if (likely(offset == 0)) {
} elseif ((s64)offset < 0) {
remain = (int)offset; goto out;
} else { /* * Note that reporting an error now * leaves everything in an inconsistent * state as we have *already* changed * the relocation value inside the * object. As we have not changed the * reloc.presumed_offset or will not * change the execobject.offset, on the * call we may not rewrite the value * inside the object, leaving it * dangling and causing a GPU hang. Unless * userspace dynamically rebuilds the * relocations on each execbuf rather than * presume a static tree. * * We did previously check if the relocations * were writable (access_ok), an error now * would be a strange race with mprotect, * having already demonstrated that we * can read from this userspace address.
*/
offset = gen8_canonical_addr(offset & ~UPDATE);
__put_user(offset,
&urelocs[r - stack].presumed_offset);
}
} while (r++, --count);
urelocs += ARRAY_SIZE(stack);
} while (remain);
out:
reloc_cache_reset(&eb->reloc_cache, eb); return remain;
}
/* * As we do not update the known relocation offsets after * relocating (due to the complexities in lock handling), * we need to mark them as invalid now so that we force the * relocation processing next time. Just in case the target * object is evicted and then rebound into its old * presumed_offset before the next execbuffer - if that * happened we would make the mistake of assuming that the * relocations were valid.
*/ if (!user_access_begin(urelocs, size)) goto end;
if (likely(!(eb->args->flags & __EXEC_USERPTR_USED))) return 0;
for (i = 0; i < count; i++) { struct eb_vma *ev = &eb->vma[i];
if (!i915_gem_object_is_userptr(ev->vma->obj)) continue;
ret = i915_gem_object_userptr_submit_init(ev->vma->obj); if (ret) return ret;
ev->flags |= __EXEC_OBJECT_USERPTR_INIT;
}
return 0;
}
static noinline int eb_relocate_parse_slow(struct i915_execbuffer *eb)
{ bool have_copy = false; struct eb_vma *ev; int err = 0;
repeat: if (signal_pending(current)) {
err = -ERESTARTSYS; goto out;
}
/* We may process another execbuffer during the unlock... */
eb_release_vmas(eb, false);
i915_gem_ww_ctx_fini(&eb->ww);
/* * We take 3 passes through the slowpatch. * * 1 - we try to just prefault all the user relocation entries and * then attempt to reuse the atomic pagefault disabled fast path again. * * 2 - we copy the user entries to a local buffer here outside of the * local and allow ourselves to wait upon any rendering before * relocations * * 3 - we already have a local copy of the relocation entries, but * were interrupted (EAGAIN) whilst waiting for the objects, try again.
*/ if (!err) {
err = eb_prefault_relocations(eb);
} elseif (!have_copy) {
err = eb_copy_relocations(eb);
have_copy = err == 0;
} else {
cond_resched();
err = 0;
}
if (!err)
err = eb_reinit_userptr(eb);
i915_gem_ww_ctx_init(&eb->ww, true); if (err) goto out;
/* reacquire the objects */
repeat_validate:
err = eb_pin_engine(eb, false); if (err) goto err;
err = eb_validate_vmas(eb); if (err) goto err;
GEM_BUG_ON(!eb->batches[0]);
list_for_each_entry(ev, &eb->relocs, reloc_link) { if (!have_copy) {
err = eb_relocate_vma(eb, ev); if (err) break;
} else {
err = eb_relocate_vma_slow(eb, ev); if (err) break;
}
}
if (err == -EDEADLK) goto err;
if (err && !have_copy) goto repeat;
if (err) goto err;
/* as last step, parse the command buffer */
err = eb_parse(eb); if (err) goto err;
/* * Leave the user relocations as are, this is the painfully slow path, * and we want to avoid the complication of dropping the lock whilst * having buffers reserved in the aperture and so causing spurious * ENOSPC for random operations.
*/
err: if (err == -EDEADLK) {
eb_release_vmas(eb, false);
err = i915_gem_ww_ctx_backoff(&eb->ww); if (!err) goto repeat_validate;
}
if (err == -EAGAIN) goto repeat;
out: if (have_copy) { constunsignedint count = eb->buffer_count; unsignedint i;
for (i = 0; i < count; i++) { conststruct drm_i915_gem_exec_object2 *entry =
&eb->exec[i]; struct drm_i915_gem_relocation_entry *relocs;
err: if (err == -EDEADLK) {
eb_release_vmas(eb, false);
err = i915_gem_ww_ctx_backoff(&eb->ww); if (!err) goto retry;
}
return err;
slow:
err = eb_relocate_parse_slow(eb); if (err) /* * If the user expects the execobject.offset and * reloc.presumed_offset to be an exact match, * as for using NO_RELOC, then we cannot update * the execobject.offset until we have completed * relocation.
*/
eb->args->flags &= ~__EXEC_HAS_RELOC;
return err;
}
/*
 * Using two helper loops for the order of which requests / batches are created
 * and added the to backend. Requests are created in order from the parent to
 * the last child. Requests are added in the reverse order, from the last child
 * to parent. This is done for locking reasons as the timeline lock is acquired
 * during request creation and released when the request is added to the
 * backend. To make lockdep happy (see intel_context_timeline_lock) this must be
 * the ordering.
 */
#define for_each_batch_create_order(_eb, _i) \
	for ((_i) = 0; (_i) < (_eb)->num_batches; ++(_i))
/*
 * Beware: this expands to TWO statements (the typecheck assertion plus
 * the for loop), so it must not be used as the sole body of an
 * un-braced if/else. The typecheck forces @_i to be a signed int so
 * the >= 0 termination test works.
 */
#define for_each_batch_add_order(_eb, _i) \
	BUILD_BUG_ON(!typecheck(int, _i)); \
	for ((_i) = (_eb)->num_batches - 1; (_i) >= 0; --(_i))
staticstruct i915_request *
eb_find_first_request_added(struct i915_execbuffer *eb)
{ int i;
for_each_batch_add_order(eb, i) if (eb->requests[i]) return eb->requests[i];
GEM_BUG_ON("Request not found");
return NULL;
}
#if IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR)
/* Stage with GFP_KERNEL allocations before we enter the signaling critical path */ staticint eb_capture_stage(struct i915_execbuffer *eb)
{ constunsignedint count = eb->buffer_count; unsignedint i = count, j;
/* * Release anything that didn't get committed due to errors. * The capture_list will otherwise be freed at request retire.
*/ staticvoid eb_capture_release(struct i915_execbuffer *eb)
{ unsignedint j;
/* * If the GPU is not _reading_ through the CPU cache, we need * to make sure that any writes (both previous GPU writes from * before a change in snooping levels and normal CPU writes) * caught in that cache are flushed to main memory. * * We want to say * obj->cache_dirty && * !(obj->cache_coherent & I915_BO_CACHE_COHERENT_FOR_READ) * but gcc's optimiser doesn't handle that as well and emits * two jumps instead of one. Maybe one day... * * FIXME: There is also sync flushing in set_pages(), which * serves a different purpose(some of the time at least). * * We should consider: * * 1. Rip out the async flush code. * * 2. Or make the sync flushing use the async clflush path * using mandatory fences underneath. Currently the below * async flush happens after we bind the object.
*/ if (unlikely(obj->cache_dirty & ~obj->cache_coherent)) { if (i915_gem_clflush_object(obj, 0))
flags &= ~EXEC_OBJECT_ASYNC;
}
/* We only need to await on the first request */ if (err == 0 && !(flags & EXEC_OBJECT_ASYNC)) {
err = i915_request_await_object
(eb_find_first_request_added(eb), obj,
flags & EXEC_OBJECT_WRITE);
}
for_each_batch_add_order(eb, j) { if (err) break; if (!eb->requests[j]) continue;
staticstruct i915_vma *eb_dispatch_secure(struct i915_execbuffer *eb, struct i915_vma *vma)
{ /* * snb/ivb/vlv conflate the "batch in ppgtt" bit with the "non-secure * batch" bit. Hence we need to pin secure batches into the global gtt.
* hsw should have this fixed, but bdw mucks it up again. */ if (eb->batch_flags & I915_DISPATCH_SECURE) return i915_gem_object_ggtt_pin_ww(vma->obj, &eb->ww, NULL, 0, 0, PIN_VALIDATE);
if (intel_context_nopreempt(rq->context))
__set_bit(I915_FENCE_FLAG_NOPREEMPT, &rq->fence.flags);
if (eb->args->flags & I915_EXEC_GEN7_SOL_RESET) {
err = i915_reset_gen7_sol_offsets(rq); if (err) return err;
}
/* * After we completed waiting for other engines (using HW semaphores) * then we can signal that this request/batch is ready to run. This * allows us to determine if the batch is still waiting on the GPU * or actually running by checking the breadcrumb.
*/ if (rq->context->engine->emit_init_breadcrumb) {
err = rq->context->engine->emit_init_breadcrumb(rq); if (err) return err;
}
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.