/* Architectures should define their poll value according to the halt latency */
unsigned int halt_poll_ns = KVM_HALT_POLL_NS_DEFAULT;
module_param(halt_poll_ns, uint, 0644);
EXPORT_SYMBOL_GPL(halt_poll_ns);

/* The start value to grow halt_poll_ns from */
unsigned int halt_poll_ns_grow_start = 10000; /* 10us */
module_param(halt_poll_ns_grow_start, uint, 0644);
EXPORT_SYMBOL_GPL(halt_poll_ns_grow_start);

/*
 * Allow direct access (from KVM or the CPU) without MMU notifier protection
 * to unpinned pages.
 */
static bool allow_unsafe_mappings;
module_param(allow_unsafe_mappings, bool, 0444);

static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
			   unsigned long arg);
#ifdef CONFIG_KVM_COMPAT
static long kvm_vcpu_compat_ioctl(struct file *file, unsigned int ioctl,
				  unsigned long arg);
#define KVM_COMPAT(c)	.compat_ioctl	= (c)
#else
/*
 * For architectures that don't implement a compat infrastructure,
 * adopt a double line of defense:
 * - Prevent a compat task from opening /dev/kvm
 * - If the open has been done by a 64bit task, and the KVM fd
 *   passed to a compat task, let the ioctls fail.
 */
static long kvm_no_compat_ioctl(struct file *file, unsigned int ioctl,
				unsigned long arg)
{
	return -EINVAL;
}
/* TODO: merge with kvm_arch_vcpu_should_kick */
static bool kvm_request_needs_ipi(struct kvm_vcpu *vcpu, unsigned req)
{
	int mode = kvm_vcpu_exiting_guest_mode(vcpu);

	/*
	 * We need to wait for the VCPU to reenable interrupts and get out of
	 * READING_SHADOW_PAGE_TABLES mode.
	 */
	if (req & KVM_REQUEST_WAIT)
		return mode != OUTSIDE_GUEST_MODE;

	/*
	 * Need to kick a running VCPU, but otherwise there is nothing to do.
	 */
	return mode == IN_GUEST_MODE;
}
static void ack_kick(void *_completed)
{
}
static inline bool kvm_kick_many_cpus(struct cpumask *cpus, bool wait)
{
	if (cpumask_empty(cpus))
		return false;

	smp_call_function_many(cpus, ack_kick, NULL, wait);
	return true;
}
static void kvm_make_vcpu_request(struct kvm_vcpu *vcpu, unsigned int req,
				  struct cpumask *tmp, int current_cpu)
{
	int cpu;

	if (likely(!(req & KVM_REQUEST_NO_ACTION)))
		__kvm_make_request(req, vcpu);

	if (!(req & KVM_REQUEST_NO_WAKEUP) && kvm_vcpu_wake_up(vcpu))
		return;

	/*
	 * Note, the vCPU could get migrated to a different pCPU at any point
	 * after kvm_request_needs_ipi(), which could result in sending an IPI
	 * to the previous pCPU.  But, that's OK because the purpose of the IPI
	 * is to ensure the vCPU returns to OUTSIDE_GUEST_MODE, which is
	 * satisfied if the vCPU migrates.  Entering READING_SHADOW_PAGE_TABLES
	 * after this point is also OK, as the requirement is only that KVM wait
	 * for vCPUs that were reading SPTEs _before_ any changes were
	 * finalized.  See kvm_vcpu_kick() for more details on handling requests.
	 */
	if (kvm_request_needs_ipi(vcpu, req)) {
		cpu = READ_ONCE(vcpu->cpu);
		if (cpu != -1 && cpu != current_cpu)
			__cpumask_set_cpu(cpu, tmp);
	}
}
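
/*
 * Illustrative sketch (not part of kvm_main.c): requests made with
 * __kvm_make_request()/kvm_make_request() are consumed by the vCPU in its
 * arch run loop via kvm_check_request(), which also clears the bit.  The
 * handler and the arch flush helper below are hypothetical and only make
 * the producer/consumer pairing concrete.
 */
static void example_handle_requests(struct kvm_vcpu *vcpu)
{
	if (!kvm_request_pending(vcpu))
		return;

	if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu))
		example_arch_flush_tlb(vcpu);	/* hypothetical arch helper */
}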
bool kvm_make_vcpus_request_mask(struct kvm *kvm, unsigned int req,
				 unsigned long *vcpu_bitmap)
{
	struct kvm_vcpu *vcpu;
	struct cpumask *cpus;
	int i, me;
	bool called;
void kvm_flush_remote_tlbs(struct kvm *kvm)
{
	/*
	 * We want to publish modifications to the page tables before reading
	 * mode.  Pairs with a memory barrier in arch-specific code.
	 * - x86: smp_mb__after_srcu_read_unlock in vcpu_enter_guest
	 *   and smp_mb in walk_shadow_page_lockless_begin/end.
	 * - powerpc: smp_mb in kvmppc_prepare_to_enter.
	 *
	 * There is already an smp_mb__after_atomic() before
	 * kvm_make_all_cpus_request() reads vcpu->mode.  We reuse that
	 * barrier here.
	 */
	if (!kvm_arch_flush_remote_tlbs(kvm)
	    || kvm_make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH))
		++kvm->stat.generic.remote_tlb_flush;
}
EXPORT_SYMBOL_GPL(kvm_flush_remote_tlbs);
	/*
	 * Fall back to flushing all TLBs if range-based TLB invalidation is
	 * unsupported by the architecture or can't be performed for whatever
	 * reason.
	 */
kvm_flush_remote_tlbs(kvm);
}
void kvm_flush_remote_tlbs_memslot(struct kvm *kvm,
				   const struct kvm_memory_slot *memslot)
{
	/*
	 * All current use cases for flushing the TLBs for a specific memslot
	 * are related to dirty logging, and many do the TLB flush out of
	 * mmu_lock.  The interaction between the various operations on memslot
	 * must be serialized by slots_lock to ensure the TLB flush from one
	 * operation is observed by any other operation on the same memslot.
	 */
	lockdep_assert_held(&kvm->slots_lock);

	kvm_flush_remote_tlbs_range(kvm, memslot->base_gfn, memslot->npages);
}
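
/*
 * Illustrative sketch (not part of kvm_main.c): a caller is expected to hold
 * slots_lock across the operation that modifies the memslot's mappings and
 * the flush, roughly as below.  The function name is hypothetical.
 */
static void example_write_protect_and_flush(struct kvm *kvm,
					    const struct kvm_memory_slot *memslot)
{
	mutex_lock(&kvm->slots_lock);
	/* ... write-protect or otherwise modify mappings for @memslot ... */
	kvm_flush_remote_tlbs_memslot(kvm, memslot);
	mutex_unlock(&kvm->slots_lock);
}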
int __kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int capacity, int min)
{
	gfp_t gfp = mc->gfp_custom ? mc->gfp_custom : GFP_KERNEL_ACCOUNT;
	void *obj;

	if (mc->nobjs >= min)
		return 0;

	if (unlikely(!mc->objects)) {
		if (WARN_ON_ONCE(!capacity))
			return -EIO;

		/*
		 * Custom init values can be used only for page allocations,
		 * and obviously conflict with __GFP_ZERO.
		 */
		if (WARN_ON_ONCE(mc->init_value && (mc->kmem_cache || mc->gfp_zero)))
			return -EIO;

		mc->objects = kvmalloc_array(capacity, sizeof(void *), gfp);
		if (!mc->objects)
			return -ENOMEM;

		mc->capacity = capacity;
	}

	/* It is illegal to request a different capacity across topups. */
	if (WARN_ON_ONCE(mc->capacity != capacity))
		return -EIO;

	while (mc->nobjs < mc->capacity) {
		obj = mmu_memory_cache_alloc_obj(mc, gfp);
		if (!obj)
			return mc->nobjs >= min ? 0 : -ENOMEM;
		mc->objects[mc->nobjs++] = obj;
	}
	return 0;
}
int kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int min)
{
	return __kvm_mmu_topup_memory_cache(mc, KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE, min);
}
int kvm_mmu_memory_cache_nr_free_objects(struct kvm_mmu_memory_cache *mc)
{
	return mc->nobjs;
}
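
/*
 * Illustrative sketch (not part of kvm_main.c): the intended usage pattern is
 * to top up the cache outside of mmu_lock, where allocations may sleep, and
 * to consume objects with kvm_mmu_memory_cache_alloc() inside the lock, where
 * allocation must neither fail nor sleep.  The cache field and function below
 * are hypothetical.
 */
static int example_fault_path(struct kvm_vcpu *vcpu)
{
	struct kvm_mmu_memory_cache *mc = &vcpu->arch.example_page_cache; /* hypothetical field */
	void *obj;
	int r;

	/* May sleep; done before taking mmu_lock. */
	r = kvm_mmu_topup_memory_cache(mc, 4);
	if (r)
		return r;

	KVM_MMU_LOCK(vcpu->kvm);
	obj = kvm_mmu_memory_cache_alloc(mc);	/* cannot fail after the topup */
	/* ... install @obj into the page tables ... */
	KVM_MMU_UNLOCK(vcpu->kvm);
	return 0;
}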
void kvm_mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
{
	while (mc->nobjs) {
		if (mc->kmem_cache)
			kmem_cache_free(mc->kmem_cache, mc->objects[--mc->nobjs]);
		else
			free_page((unsigned long)mc->objects[--mc->nobjs]);
}
	/*
	 * No need for rcu_read_lock as VCPU_RUN is the only place that changes
	 * the vcpu->pid pointer, and at destruction time all file descriptors
	 * are already gone.
	 */
	put_pid(vcpu->pid);

	/*
	 * Assert that the vCPU isn't visible in any way, to ensure KVM
	 * doesn't trigger a use-after-free if destroying vCPUs results
	 * in a VM-wide request, e.g. to flush remote TLBs when tearing
	 * down MMUs, or to mark the VM dead if a KVM_BUG_ON() fires.
	 */
WARN_ON_ONCE(xa_load(&kvm->vcpu_array, i) || kvm_get_vcpu(kvm, i));
}
struct kvm_mmu_notifier_range {
	/*
	 * 64-bit addresses, as KVM notifiers can operate on host virtual
	 * addresses (unsigned long) and guest physical addresses (64-bit).
	 */
	u64 start;
	u64 end;
	union kvm_mmu_notifier_arg arg;
	gfn_handler_t handler;
	on_lock_fn_t on_lock;
	bool flush_on_ret;
	bool may_block;
	bool lockless;
};
/*
 * The inner-most helper returns a tuple containing the return value from the
 * arch- and action-specific handler, plus a flag indicating whether or not at
 * least one memslot was found, i.e. if the handler found guest memory.
 *
 * Note, most notifiers are averse to booleans, so even though KVM tracks the
 * return from arch code as a bool, outer helpers will cast it to an int. :-(
 */
typedef struct kvm_mmu_notifier_return {
	bool ret;
	bool found_memslot;
} kvm_mn_ret_t;
/*
 * Use a dedicated stub instead of NULL to indicate that there is no callback
 * function/handler.  The compiler technically can't guarantee that a real
 * function will have a non-zero address, and so it will generate code to
 * check for !NULL, whereas comparing against a stub will be elided at compile
 * time (unless the compiler is getting long in the tooth, e.g. gcc 4.9).
 */
static void kvm_null_fn(void)
{
}
	if (WARN_ON_ONCE(range->end <= range->start))
		return r;

	/* A null handler is allowed if and only if on_lock() is provided. */
	if (WARN_ON_ONCE(IS_KVM_NULL_FN(range->on_lock) &&
			 IS_KVM_NULL_FN(range->handler)))
		return r;

	/* on_lock will never be called for lockless walks */
	if (WARN_ON_ONCE(range->lockless && !IS_KVM_NULL_FN(range->on_lock)))
		return r;

	idx = srcu_read_lock(&kvm->srcu);

	for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
		struct interval_tree_node *node;

		/*
		 * To optimize for the likely case where the address
		 * range is covered by zero or one memslots, don't
		 * bother making these conditional (to avoid writes on
		 * the second or later invocation of the handler).
		 */
		gfn_range.arg = range->arg;
		gfn_range.may_block = range->may_block;

		/*
		 * HVA-based notifications aren't relevant to private
		 * mappings as they don't have a userspace mapping.
		 */
		gfn_range.attr_filter = KVM_FILTER_SHARED;
void kvm_mmu_invalidate_begin(struct kvm *kvm)
{
	lockdep_assert_held_write(&kvm->mmu_lock);
	/*
	 * The count increase must become visible at unlock time as no
	 * spte can be established without taking the mmu_lock and
	 * count is also read inside the mmu_lock critical section.
	 */
kvm->mmu_invalidate_in_progress++;
if (likely(kvm->mmu_invalidate_range_start == INVALID_GPA)) {
kvm->mmu_invalidate_range_start = start;
kvm->mmu_invalidate_range_end = end;
	} else {
		/*
		 * Fully tracking multiple concurrent ranges has diminishing
		 * returns.  Keep things simple and just find the minimal range
		 * which includes the current and new ranges.  As there won't be
		 * enough information to subtract a range after its invalidate
		 * completes, any ranges invalidated concurrently will
		 * accumulate and persist until all outstanding invalidates
		 * complete.
		 */
kvm->mmu_invalidate_range_start =
min(kvm->mmu_invalidate_range_start, start);
kvm->mmu_invalidate_range_end =
max(kvm->mmu_invalidate_range_end, end);
}
}
	/*
	 * Prevent memslot modification between range_start() and range_end()
	 * so that conditionally locking provides the same result in both
	 * functions.  Without that guarantee, the mmu_invalidate_in_progress
	 * adjustments will be imbalanced.
	 *
	 * Pairs with the decrement in range_end().
	 */
spin_lock(&kvm->mn_invalidate_lock);
kvm->mn_active_invalidate_count++;
spin_unlock(&kvm->mn_invalidate_lock);
	/*
	 * Invalidate pfn caches _before_ invalidating the secondary MMUs, i.e.
	 * before acquiring mmu_lock, to avoid holding mmu_lock while acquiring
	 * each cache's lock.  There are relatively few caches in existence at
	 * any given time, and the caches themselves can check for hva overlap,
	 * i.e. don't need to rely on memslot overlap checks for performance.
	 * Because this runs without holding mmu_lock, the pfn caches must use
	 * mn_active_invalidate_count (see above) instead of
	 * mmu_invalidate_in_progress.
	 */
gfn_to_pfn_cache_invalidate_start(kvm, range->start, range->end);
	/*
	 * If one or more memslots were found and thus zapped, notify arch code
	 * that guest memory has been reclaimed.  This needs to be done *after*
	 * dropping mmu_lock, as x86's reclaim path is slooooow.
	 */
	if (kvm_handle_hva_range(kvm, &hva_range).found_memslot)
kvm_arch_guest_memory_reclaimed(kvm);
	/*
	 * This sequence increase will notify the kvm page fault that
	 * the page that is going to be mapped in the spte could have
	 * been freed.
	 */
kvm->mmu_invalidate_seq++;
	smp_wmb();
	/*
	 * The above sequence increase must be visible before the
	 * below count decrease, which is ensured by the smp_wmb above
	 * in conjunction with the smp_rmb in mmu_invalidate_retry().
	 */
kvm->mmu_invalidate_in_progress--;
KVM_BUG_ON(kvm->mmu_invalidate_in_progress < 0, kvm);
	/*
	 * Assert that at least one range was added between start() and end().
	 * Not adding a range isn't fatal, but it is a KVM bug.
	 */
WARN_ON_ONCE(kvm->mmu_invalidate_range_start == INVALID_GPA);
}
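
/*
 * Illustrative sketch (not part of kvm_main.c): the sequence count bumped in
 * kvm_mmu_invalidate_begin()/end() pairs with mmu_invalidate_retry() in the
 * page fault path.  A fault handler snapshots mmu_invalidate_seq before
 * resolving the gfn, then rechecks it under mmu_lock and bails if an
 * invalidation raced with the translation.  The function below is hypothetical.
 */
static int example_map_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
{
	struct kvm *kvm = vcpu->kvm;
	unsigned long mmu_seq = kvm->mmu_invalidate_seq;

	smp_rmb();	/* pairs with the smp_wmb() in kvm_mmu_invalidate_end() */

	/* ... resolve @gfn to a pfn, possibly sleeping ... */

	KVM_MMU_LOCK(kvm);
	if (mmu_invalidate_retry(kvm, mmu_seq)) {
		KVM_MMU_UNLOCK(kvm);
		return -EAGAIN;	/* let the vCPU retry the fault */
	}
	/* ... safe to install the translation ... */
	KVM_MMU_UNLOCK(kvm);
	return 0;
}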
/* Pairs with the increment in range_start(). */
	spin_lock(&kvm->mn_invalidate_lock);
	if (!WARN_ON_ONCE(!kvm->mn_active_invalidate_count))
--kvm->mn_active_invalidate_count;
wake = !kvm->mn_active_invalidate_count;
spin_unlock(&kvm->mn_invalidate_lock);
	/*
	 * There can only be one waiter, since the wait happens under
	 * slots_lock.
	 */
	if (wake)
rcuwait_wake_up(&kvm->mn_memslots_update_rcuwait);
}
	/*
	 * Even though we do not flush TLB, this will still adversely
	 * affect performance on pre-Haswell Intel EPT, where there is
	 * no EPT Access Bit to clear so that we have to tear down EPT
	 * tables instead.  If we find this unacceptable, we can always
	 * add a parameter to kvm_age_hva so that it effectively doesn't
	 * do anything on clear_young.
	 *
	 * Also note that currently we never issue secondary TLB flushes
	 * from clear_young, leaving this job up to the regular system
	 * cadence.  If we find this inaccurate, we might come up with a
	 * more sophisticated heuristic later.
	 */
	return kvm_age_hva_range_no_flush(mn, start, end, kvm_age_gfn);
}
/* This does not remove the slot from struct kvm_memslots data structures */
static void kvm_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
{
	if (slot->flags & KVM_MEM_GUEST_MEMFD)
kvm_gmem_unbind(slot);
	/*
	 * The same memslot objects live in both active and inactive sets,
	 * arbitrarily free using index '1' so the second invocation of this
	 * function isn't operating over a structure with dangling pointers
	 * (even though this function isn't actually touching them).
	 */
	if (!slots->node_idx)
		return;
for (i = 0; i < kvm_vm_stats_header.num_desc; ++i) {
pdesc = &kvm_vm_stats_desc[i];
		stat_data = kzalloc(sizeof(*stat_data), GFP_KERNEL_ACCOUNT);
		if (!stat_data)
			goto out_err;
for (i = 0; i < kvm_vcpu_stats_header.num_desc; ++i) {
pdesc = &kvm_vcpu_stats_desc[i];
		stat_data = kzalloc(sizeof(*stat_data), GFP_KERNEL_ACCOUNT);
		if (!stat_data)
			goto out_err;
/*
 * Called just after removing the VM from the vm_list, but before doing any
 * other destruction.
 */
void __weak kvm_arch_pre_destroy_vm(struct kvm *kvm)
{
}
/*
 * Called after the per-VM debugfs directory has been created.  When this is
 * called, kvm->debugfs_dentry is already set up, so arch-specific debugfs
 * entries can be created under it.  Cleanup is done automatically and
 * recursively in kvm_destroy_vm_debugfs(), so a per-arch destroy interface
 * is not needed.
 */
void __weak kvm_arch_create_vm_debugfs(struct kvm *kvm)
{
}
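
/*
 * Illustrative sketch (not part of kvm_main.c): an architecture that wants
 * per-VM debugfs entries would provide its own kvm_arch_create_vm_debugfs()
 * and create files under kvm->debugfs_dentry.  The entry name and the
 * kvm->arch field below are hypothetical.
 */
static void example_arch_create_vm_debugfs(struct kvm *kvm)
{
	debugfs_create_ulong("example_counter", 0444, kvm->debugfs_dentry,
			     &kvm->arch.example_counter);	/* hypothetical field */
}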
static struct kvm *kvm_create_vm(unsigned long type, const char *fdname)
{
	struct kvm *kvm = kvm_arch_alloc_vm();
	struct kvm_memslots *slots;
	int r, i, j;

	/*
	 * Force subsequent debugfs file creations to fail if the VM directory
	 * is not created (by kvm_create_vm_debugfs()).
	 */
kvm->debugfs_dentry = ERR_PTR(-ENOENT);
	r = -ENOMEM;
	for (i = 0; i < KVM_NR_BUSES; i++) {
		rcu_assign_pointer(kvm->buses[i],
			kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL_ACCOUNT));
		if (!kvm->buses[i])
			goto out_err_no_arch_destroy_vm;
}
	r = kvm_arch_init_vm(kvm, type);
	if (r)
		goto out_err_no_arch_destroy_vm;

	r = kvm_enable_virtualization();
	if (r)
		goto out_err_no_disable;
	/*
	 * We do not need to take the kvm->lock here, because nobody else
	 * has a reference to the struct kvm at this point and therefore
	 * cannot access the devices list anyhow.
	 *
	 * The device list is generally managed as an rculist, but list_del()
	 * is used intentionally here.  If a bug in KVM introduced a reader that
	 * was not backed by a reference on the kvm struct, the hope is that
	 * it'd consume the poisoned forward pointer instead of suffering a
	 * use-after-free, even though this cannot be guaranteed.
	 */
list_for_each_entry_safe(dev, tmp, &kvm->devices, vm_node) {
list_del(&dev->vm_node);
dev->ops->destroy(dev);
}
}
	kvm_free_irq_routing(kvm);
	for (i = 0; i < KVM_NR_BUSES; i++) {
		struct kvm_io_bus *bus = kvm_get_bus(kvm, i);

		if (bus)
kvm_io_bus_destroy(bus);
kvm->buses[i] = NULL;
}
	kvm_coalesced_mmio_free(kvm);
#ifdef CONFIG_KVM_GENERIC_MMU_NOTIFIER
	mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm);
	/*
	 * At this point, pending calls to invalidate_range_start()
	 * have completed but no more MMU notifiers will run, so
	 * mn_active_invalidate_count may remain unbalanced.
	 * No threads can be waiting in kvm_swap_active_memslots() as the
	 * last reference on KVM has been dropped, but freeing
	 * memslots would deadlock without this manual intervention.
	 *
	 * If the count isn't unbalanced, i.e. KVM did NOT unregister its MMU
	 * notifier between a start() and end(), then there shouldn't be any
	 * in-progress invalidations.
	 */
	WARN_ON(rcuwait_active(&kvm->mn_memslots_update_rcuwait));
	if (kvm->mn_active_invalidate_count)
		kvm->mn_active_invalidate_count = 0;
	else
		WARN_ON(kvm->mmu_invalidate_in_progress);
#else
	kvm_flush_shadow_all(kvm);
#endif
kvm_arch_destroy_vm(kvm);
	kvm_destroy_devices(kvm);
	for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
kvm_free_memslots(kvm, &kvm->__memslots[i][0]);
kvm_free_memslots(kvm, &kvm->__memslots[i][1]);
}
cleanup_srcu_struct(&kvm->irq_srcu);
	cleanup_srcu_struct(&kvm->srcu);
#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
	xa_destroy(&kvm->mem_attr_array);
#endif
kvm_arch_free_vm(kvm);
preempt_notifier_dec();
kvm_disable_virtualization();
mmdrop(mm);
}
/*
 * Make sure the VM is not under destruction; this is a safe version of
 * kvm_get_kvm().  Return true if kvm was referenced successfully, false
 * otherwise.
 */
bool kvm_get_kvm_safe(struct kvm *kvm)
{
	return refcount_inc_not_zero(&kvm->users_count);
}
EXPORT_SYMBOL_GPL(kvm_get_kvm_safe);
void kvm_put_kvm(struct kvm *kvm)
{
	if (refcount_dec_and_test(&kvm->users_count))
kvm_destroy_vm(kvm);
}
EXPORT_SYMBOL_GPL(kvm_put_kvm);
/*
 * Used to put a reference that was taken on behalf of an object associated
 * with a user-visible file descriptor, e.g. a vcpu or device, if installation
 * of the new file descriptor fails and the reference cannot be transferred to
 * its final owner.  In such cases, the caller is still actively using @kvm and
 * will fail miserably if the refcount unexpectedly hits zero.
 */
void kvm_put_kvm_no_destroy(struct kvm *kvm)
{
WARN_ON(refcount_dec_and_test(&kvm->users_count));
}
EXPORT_SYMBOL_GPL(kvm_put_kvm_no_destroy);
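
/*
 * Illustrative sketch (not part of kvm_main.c): asynchronous code that stashes
 * a kvm pointer without holding a reference must use kvm_get_kvm_safe() before
 * dereferencing it, since the VM may already be on its way down.  The worker
 * below is hypothetical.
 */
static void example_async_worker(struct kvm *kvm)
{
	if (!kvm_get_kvm_safe(kvm))
		return;		/* users_count already hit zero, VM is dying */

	/* ... safely operate on @kvm ... */

	kvm_put_kvm(kvm);
}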
/*
 * Allocation size is twice as large as the actual dirty bitmap size.
 * See kvm_vm_ioctl_get_dirty_log() for why this is needed.
 */
static int kvm_alloc_dirty_bitmap(struct kvm_memory_slot *memslot)
{
	unsigned long dirty_bytes = kvm_dirty_bitmap_bytes(memslot);

	memslot->dirty_bitmap = __vcalloc(2, dirty_bytes, GFP_KERNEL_ACCOUNT);
	if (!memslot->dirty_bitmap)
		return -ENOMEM;

return 0;
}
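
/*
 * Illustrative sketch (not part of kvm_main.c): the second half of the 2x
 * allocation is used as a scratch snapshot buffer when the dirty log is
 * harvested in kvm_get_dirty_log_protect() below, so the live bitmap and the
 * buffer copied to userspace never alias.  The helper below only mirrors that
 * layout for illustration.
 */
static inline unsigned long *example_second_half(struct kvm_memory_slot *memslot)
{
	return memslot->dirty_bitmap +
	       kvm_dirty_bitmap_bytes(memslot) / sizeof(*memslot->dirty_bitmap);
}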
static struct kvm_memslots *kvm_get_inactive_memslots(struct kvm *kvm, int as_id)
{
	struct kvm_memslots *active = __kvm_memslots(kvm, as_id);
	int node_idx_inactive = active->node_idx ^ 1;
/*
 * Helper to get the address space ID when one of the memslot pointers may be
 * NULL.  This also serves as a sanity check that at least one of the pointers
 * is non-NULL, and that their address space IDs don't diverge.
 */
static int kvm_memslots_get_as_id(struct kvm_memory_slot *a,
				  struct kvm_memory_slot *b)
{
	if (WARN_ON_ONCE(!a && !b))
		return 0;
/*
 * Replace @old with @new in the inactive memslots.
 *
 * With NULL @old this simply adds @new.
 * With NULL @new this simply removes @old.
 *
 * If @new is non-NULL its hva_node[slots_idx] range has to be set
 * appropriately.
 */
static void kvm_replace_memslot(struct kvm *kvm,
				struct kvm_memory_slot *old,
				struct kvm_memory_slot *new)
{
	int as_id = kvm_memslots_get_as_id(old, new);
	struct kvm_memslots *slots = kvm_get_inactive_memslots(kvm, as_id);
	int idx = slots->node_idx;
if (old) {
hash_del(&old->id_node[idx]);
interval_tree_remove(&old->hva_node[idx], &slots->hva_tree);
if ((long)old == atomic_long_read(&slots->last_used_slot))
atomic_long_set(&slots->last_used_slot, (long)new);
if (!new) {
			kvm_erase_gfn_node(slots, old);
			return;
}
}
	/*
	 * Initialize @new's hva range.  Do this even when replacing an @old
	 * slot, kvm_copy_memslot() deliberately does not touch node data.
	 */
new->hva_node[idx].start = new->userspace_addr;
new->hva_node[idx].last = new->userspace_addr +
(new->npages << PAGE_SHIFT) - 1;
	/*
	 * (Re)Add the new memslot.  There is no O(1) interval_tree_replace(),
	 * hva_node needs to be swapped with remove+insert even though hva can't
	 * change when replacing an existing slot.
	 */
hash_add(slots->id_hash, &new->id_node[idx], new->id);
interval_tree_insert(&new->hva_node[idx], &slots->hva_tree);
	/*
	 * If the memslot gfn is unchanged, rb_replace_node() can be used to
	 * switch the node in the gfn tree instead of removing the old and
	 * inserting the new as two separate operations.  Replacement is a
	 * single O(1) operation versus two O(log(n)) operations for
	 * remove+insert.
	 */
	if (old && old->base_gfn == new->base_gfn) {
kvm_replace_gfn_node(slots, old, new);
	} else {
		if (old)
kvm_erase_gfn_node(slots, old);
kvm_insert_gfn_node(slots, new);
}
}
/*
 * Flags that do not access any of the extra space of struct
 * kvm_userspace_memory_region2.  KVM_SET_USER_MEMORY_REGION_V1_FLAGS
 * only allows these.
 */
#define KVM_SET_USER_MEMORY_REGION_V1_FLAGS \
	(KVM_MEM_LOG_DIRTY_PAGES | KVM_MEM_READONLY)
if (kvm_arch_has_private_mem(kvm))
valid_flags |= KVM_MEM_GUEST_MEMFD;
	/* Dirty logging of private memory is not currently supported. */
	if (mem->flags & KVM_MEM_GUEST_MEMFD)
valid_flags &= ~KVM_MEM_LOG_DIRTY_PAGES;
	/*
	 * GUEST_MEMFD is incompatible with read-only memslots, as writes to
	 * read-only memslots have emulated MMIO, not page fault, semantics,
	 * and KVM doesn't allow emulated MMIO for private memory.
	 */
	if (kvm_arch_has_readonly_mem(kvm) &&
	    !(mem->flags & KVM_MEM_GUEST_MEMFD))
valid_flags |= KVM_MEM_READONLY;
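
/*
 * Illustrative sketch (not part of kvm_main.c): a userspace VMM binds
 * guest_memfd-backed memory with the region2 ioctl; per the checks above,
 * KVM_MEM_LOG_DIRTY_PAGES and KVM_MEM_READONLY cannot be combined with
 * KVM_MEM_GUEST_MEMFD.  The fds, slot number, and addresses are hypothetical.
 */
#if 0	/* userspace snippet, shown here for illustration only */
#include <linux/kvm.h>
#include <sys/ioctl.h>

static int example_bind_guest_memfd(int vm_fd, int guest_memfd,
				    __u64 gpa, __u64 size, void *shared_hva)
{
	struct kvm_userspace_memory_region2 region = {
		.slot			= 0,
		.flags			= KVM_MEM_GUEST_MEMFD,
		.guest_phys_addr	= gpa,
		.memory_size		= size,
		.userspace_addr		= (__u64)(unsigned long)shared_hva,
		.guest_memfd		= guest_memfd,
		.guest_memfd_offset	= 0,
	};

	return ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION2, &region);
}
#endif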
	/* Grab the generation from the active memslots. */
u64 gen = __kvm_memslots(kvm, as_id)->generation;
WARN_ON(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS);
slots->generation = gen | KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS;
	/*
	 * Do not store the new memslots while there are invalidations in
	 * progress, otherwise the locking in invalidate_range_start and
	 * invalidate_range_end will be unbalanced.
	 */
spin_lock(&kvm->mn_invalidate_lock);
	prepare_to_rcuwait(&kvm->mn_memslots_update_rcuwait);
	while (kvm->mn_active_invalidate_count) {
set_current_state(TASK_UNINTERRUPTIBLE);
spin_unlock(&kvm->mn_invalidate_lock);
schedule();
spin_lock(&kvm->mn_invalidate_lock);
}
finish_rcuwait(&kvm->mn_memslots_update_rcuwait);
rcu_assign_pointer(kvm->memslots[as_id], slots);
spin_unlock(&kvm->mn_invalidate_lock);
	/*
	 * Acquired in kvm_set_memslot().  Must be released before the
	 * synchronize_srcu() call below in order to avoid deadlocking with
	 * another thread acquiring slots_arch_lock in an SRCU critical
	 * section.
	 */
mutex_unlock(&kvm->slots_arch_lock);
synchronize_srcu_expedited(&kvm->srcu);
	/*
	 * Increment the new memslot generation a second time, dropping the
	 * update in-progress flag and incrementing the generation based on
	 * the number of address spaces.  This provides a unique and easily
	 * identifiable generation number while the memslots are in flux.
	 */
gen = slots->generation & ~KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS;
	/*
	 * Generations must be unique even across address spaces.  We do not need
	 * a global counter for that, instead the generation space is evenly split
	 * across address spaces.  For example, with two address spaces, address
	 * space 0 will use generations 0, 2, 4, ... while address space 1 will
	 * use generations 1, 3, 5, ...
	 */
gen += kvm_arch_nr_memslot_as_ids(kvm);
	/*
	 * If dirty logging is disabled, nullify the bitmap; the old bitmap
	 * will be freed on "commit".  If logging is enabled in both old and
	 * new, reuse the existing bitmap.  If logging is enabled only in the
	 * new and KVM isn't using a ring buffer, allocate and initialize a
	 * new bitmap.
	 */
	if (change != KVM_MR_DELETE) {
		if (!(new->flags & KVM_MEM_LOG_DIRTY_PAGES))
			new->dirty_bitmap = NULL;
		else if (old && old->dirty_bitmap)
			new->dirty_bitmap = old->dirty_bitmap;
		else if (kvm_use_dirty_bitmap(kvm)) {
			r = kvm_alloc_dirty_bitmap(new);
			if (r)
				return r;

if (kvm_dirty_log_manual_protect_and_init_set(kvm))
bitmap_set(new->dirty_bitmap, 0, new->npages);
}
}
r = kvm_arch_prepare_memory_region(kvm, old, new, change);
	/* Free the bitmap on failure if it was allocated above. */
	if (r && new && new->dirty_bitmap && (!old || !old->dirty_bitmap))
kvm_destroy_dirty_bitmap(new);
return r;
}
static void kvm_commit_memory_region(struct kvm *kvm,
				     struct kvm_memory_slot *old,
				     const struct kvm_memory_slot *new,
				     enum kvm_mr_change change)
{
	int old_flags = old ? old->flags : 0;
	int new_flags = new ? new->flags : 0;
	/*
	 * Update the total number of memslot pages before calling the arch
	 * hook so that architectures can consume the result directly.
	 */
	if (change == KVM_MR_DELETE)
		kvm->nr_memslot_pages -= old->npages;
	else if (change == KVM_MR_CREATE)
		kvm->nr_memslot_pages += new->npages;

	switch (change) {
	case KVM_MR_CREATE:
		/* Nothing more to do. */
		break;
	case KVM_MR_DELETE:
		/* Free the old memslot and all its metadata. */
		kvm_free_memslot(kvm, old);
		break;
	case KVM_MR_MOVE:
	case KVM_MR_FLAGS_ONLY:
		/*
		 * Free the dirty bitmap as needed; the below check encompasses
		 * both the flags and whether a ring buffer is being used.
		 */
		if (old->dirty_bitmap && !new->dirty_bitmap)
kvm_destroy_dirty_bitmap(old);
		/*
		 * The final quirk.  Free the detached, old slot, but only its
		 * memory, not any metadata.  Metadata, including arch specific
		 * data, may be reused by @new.
		 */
		kfree(old);
		break;
	default:
BUG();
}
}
/*
 * Activate @new, which must be installed in the inactive slots by the caller,
 * by swapping the active slots and then propagating @new to @old once @old is
 * unreachable and can be safely modified.
 *
 * With NULL @old this simply adds @new to @active (while swapping the sets).
 * With NULL @new this simply removes @old from @active and frees it
 * (while also swapping the sets).
 */
static void kvm_activate_memslot(struct kvm *kvm,
				 struct kvm_memory_slot *old,
				 struct kvm_memory_slot *new)
{
	int as_id = kvm_memslots_get_as_id(old, new);
kvm_swap_active_memslots(kvm, as_id);
/* Propagate the new memslot to the now inactive memslots. */
kvm_replace_memslot(kvm, old, new);
}
static void kvm_invalidate_memslot(struct kvm *kvm,
				   struct kvm_memory_slot *old,
				   struct kvm_memory_slot *invalid_slot)
{
	/*
	 * Mark the current slot INVALID.  As with all memslot modifications,
	 * this must be done on an unreachable slot to avoid modifying the
	 * current slot in the active tree.
	 */
kvm_copy_memslot(invalid_slot, old);
invalid_slot->flags |= KVM_MEMSLOT_INVALID;
kvm_replace_memslot(kvm, old, invalid_slot);
	/*
	 * Activate the slot that is now marked INVALID, but don't propagate
	 * the slot to the now inactive slots.  The slot is either going to be
	 * deleted or recreated as a new slot.
	 */
kvm_swap_active_memslots(kvm, old->as_id);
	/*
	 * From this point no new shadow pages pointing to a deleted, or moved,
	 * memslot will be created.  Validation of sp->gfn happens in:
	 * - gfn_to_hva (kvm_read_guest, gfn_to_pfn)
	 * - kvm_is_visible_gfn (mmu_check_root)
	 */
kvm_arch_flush_shadow_memslot(kvm, old);
kvm_arch_guest_memory_reclaimed(kvm);
/* Was released by kvm_swap_active_memslots(), reacquire. */
mutex_lock(&kvm->slots_arch_lock);
	/*
	 * Copy the arch-specific field of the newly-installed slot back to the
	 * old slot as the arch data could have changed between releasing
	 * slots_arch_lock in kvm_swap_active_memslots() and re-acquiring the
	 * lock above.  Writers are required to retrieve memslots *after*
	 * acquiring slots_arch_lock, thus the active slot's data is guaranteed
	 * to be fresh.
	 */
old->arch = invalid_slot->arch;
}
static void kvm_create_memslot(struct kvm *kvm, struct kvm_memory_slot *new)
{
	/* Add the new memslot to the inactive set and activate. */
kvm_replace_memslot(kvm, NULL, new);
kvm_activate_memslot(kvm, NULL, new);
}
static void kvm_delete_memslot(struct kvm *kvm,
			       struct kvm_memory_slot *old,
			       struct kvm_memory_slot *invalid_slot)
{
	/*
	 * Remove the old memslot (in the inactive memslots) by passing NULL as
	 * the "new" slot, and for the invalid version in the active slots.
	 */
kvm_replace_memslot(kvm, old, NULL);
kvm_activate_memslot(kvm, invalid_slot, NULL);
}
static void kvm_move_memslot(struct kvm *kvm,
			     struct kvm_memory_slot *old,
			     struct kvm_memory_slot *new,
			     struct kvm_memory_slot *invalid_slot)
{
	/*
	 * Replace the old memslot in the inactive slots, and then swap slots
	 * and replace the current INVALID with the new as well.
	 */
kvm_replace_memslot(kvm, old, new);
kvm_activate_memslot(kvm, invalid_slot, new);
}
static void kvm_update_flags_memslot(struct kvm *kvm,
				     struct kvm_memory_slot *old,
				     struct kvm_memory_slot *new)
{
	/*
	 * Similar to the MOVE case, but the slot doesn't need to be zapped as
	 * an intermediate step.  Instead, the old memslot is simply replaced
	 * with a new, updated copy in both memslot sets.
	 */
kvm_replace_memslot(kvm, old, new);
kvm_activate_memslot(kvm, old, new);
}
	/*
	 * Released in kvm_swap_active_memslots().
	 *
	 * Must be held from before the current memslots are copied until after
	 * the new memslots are installed with rcu_assign_pointer, then
	 * released before the synchronize srcu in kvm_swap_active_memslots().
	 *
	 * When modifying memslots outside of the slots_lock, must be held
	 * before reading the pointer to the current memslots until after all
	 * changes to those memslots are complete.
	 *
	 * These rules ensure that installing new memslots does not lose
	 * changes made to the previous memslots.
	 */
mutex_lock(&kvm->slots_arch_lock);
	/*
	 * Invalidate the old slot if it's being deleted or moved.  This is
	 * done prior to actually deleting/moving the memslot to allow vCPUs to
	 * continue running by ensuring there are no mappings or shadow pages
	 * for the memslot when it is deleted/moved.  Without pre-invalidation
	 * (and without a lock), a window would exist between effecting the
	 * delete/move and committing the changes in arch code where KVM or a
	 * guest could access a non-existent memslot.
	 *
	 * Modifications are done on a temporary, unreachable slot.  The old
	 * slot needs to be preserved in case a later step fails and the
	 * invalidation needs to be reverted.
	 */
	if (change == KVM_MR_DELETE || change == KVM_MR_MOVE) {
		invalid_slot = kzalloc(sizeof(*invalid_slot), GFP_KERNEL_ACCOUNT);
		if (!invalid_slot) {
			mutex_unlock(&kvm->slots_arch_lock);
			return -ENOMEM;
}
kvm_invalidate_memslot(kvm, old, invalid_slot);
}
	r = kvm_prepare_memory_region(kvm, old, new, change);
	if (r) {
		/*
		 * For DELETE/MOVE, revert the above INVALID change.  No
		 * modifications required since the original slot was preserved
		 * in the inactive slots.  Changing the active memslots also
		 * releases slots_arch_lock.
		 */
		if (change == KVM_MR_DELETE || change == KVM_MR_MOVE) {
kvm_activate_memslot(kvm, invalid_slot, old);
kfree(invalid_slot);
} else {
mutex_unlock(&kvm->slots_arch_lock);
		}
		return r;
}
	/*
	 * For DELETE and MOVE, the working slot is now active as the INVALID
	 * version of the old slot.  MOVE is particularly special as it reuses
	 * the old slot and returns a copy of the old slot (in working_slot).
	 * For CREATE, there is no old slot.  For DELETE and FLAGS_ONLY, the
	 * old slot is detached but otherwise preserved.
	 */
	if (change == KVM_MR_CREATE)
		kvm_create_memslot(kvm, new);
	else if (change == KVM_MR_DELETE)
		kvm_delete_memslot(kvm, old, invalid_slot);
	else if (change == KVM_MR_MOVE)
		kvm_move_memslot(kvm, old, new, invalid_slot);
	else if (change == KVM_MR_FLAGS_ONLY)
		kvm_update_flags_memslot(kvm, old, new);
	else
		BUG();

	/* Free the temporary INVALID slot used for DELETE and MOVE. */
	if (change == KVM_MR_DELETE || change == KVM_MR_MOVE)
kfree(invalid_slot);
	/*
	 * No need to refresh new->arch, changes after dropping slots_arch_lock
	 * will directly hit the final, active memslot.  Architectures are
	 * responsible for knowing that new->arch may be stale.
	 */
kvm_commit_memory_region(kvm, old, new, change);
	r = check_memory_region_flags(kvm, mem);
	if (r)
		return r;

as_id = mem->slot >> 16;
id = (u16)mem->slot;
	/* General sanity checks */
	if ((mem->memory_size & (PAGE_SIZE - 1)) ||
	    (mem->memory_size != (unsigned long)mem->memory_size))
		return -EINVAL;
	if (mem->guest_phys_addr & (PAGE_SIZE - 1))
		return -EINVAL;
	/* We can read the guest memory with __xxx_user() later on. */
	if ((mem->userspace_addr & (PAGE_SIZE - 1)) ||
	    (mem->userspace_addr != untagged_addr(mem->userspace_addr)) ||
	    !access_ok((void __user *)(unsigned long)mem->userspace_addr,
			mem->memory_size))
		return -EINVAL;
	if (mem->flags & KVM_MEM_GUEST_MEMFD &&
	    (mem->guest_memfd_offset & (PAGE_SIZE - 1) ||
	     mem->guest_memfd_offset + mem->memory_size < mem->guest_memfd_offset))
		return -EINVAL;
	if (as_id >= kvm_arch_nr_memslot_as_ids(kvm) || id >= KVM_MEM_SLOTS_NUM)
		return -EINVAL;
	if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
		return -EINVAL;
	/*
	 * The size of userspace-defined memory regions is restricted in order
	 * to play nice with dirty bitmap operations, which are indexed with an
	 * "unsigned int".  KVM's internal memory regions don't support dirty
	 * logging, and so are exempt.
	 */
	if (id < KVM_USER_MEM_SLOTS &&
	    (mem->memory_size >> PAGE_SHIFT) > KVM_MEM_MAX_NR_PAGES)
		return -EINVAL;
slots = __kvm_memslots(kvm, as_id);
	/*
	 * Note, the old memslot (and the pointer itself!) may be invalidated
	 * and/or destroyed by kvm_set_memslot().
	 */
old = id_to_memslot(slots, id);
	if (!mem->memory_size) {
		if (!old || !old->npages)
			return -EINVAL;

		if (WARN_ON_ONCE(kvm->nr_memslot_pages < old->npages))
			return -EIO;
if (!old || !old->npages) {
change = KVM_MR_CREATE;
		/*
		 * To simplify KVM internals, the total number of pages across
		 * all memslots must fit in an unsigned long.
		 */
		if ((kvm->nr_memslot_pages + npages) < kvm->nr_memslot_pages)
			return -EINVAL;
	} else { /* Modify an existing slot. */
		/* Private memslots are immutable, they can only be deleted. */
		if (mem->flags & KVM_MEM_GUEST_MEMFD)
			return -EINVAL;
		if ((mem->userspace_addr != old->userspace_addr) ||
		    (npages != old->npages) ||
		    ((mem->flags ^ old->flags) & KVM_MEM_READONLY))
			return -EINVAL;
#ifndef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
/**
 * kvm_get_dirty_log - get a snapshot of dirty pages
 * @kvm:	pointer to kvm instance
 * @log:	slot id and address to which we copy the log
 * @is_dirty:	set to '1' if any dirty pages were found
 * @memslot:	set to the associated memslot, always valid on success
 */
int kvm_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log,
		      int *is_dirty, struct kvm_memory_slot **memslot)
{
	struct kvm_memslots *slots;
	int i, as_id, id;
	unsigned long n;
	unsigned long any = 0;

	/* Dirty ring tracking may be exclusive to dirty log tracking */
	if (!kvm_use_dirty_bitmap(kvm))
		return -ENXIO;
*memslot = NULL;
*is_dirty = 0;
as_id = log->slot >> 16;
	id = (u16)log->slot;
	if (as_id >= kvm_arch_nr_memslot_as_ids(kvm) || id >= KVM_USER_MEM_SLOTS)
		return -EINVAL;
for (i = 0; !any && i < n/sizeof(long); ++i)
any = (*memslot)->dirty_bitmap[i];
	if (copy_to_user(log->dirty_bitmap, (*memslot)->dirty_bitmap, n))
		return -EFAULT;

	if (any)
		*is_dirty = 1;
	return 0;
}
EXPORT_SYMBOL_GPL(kvm_get_dirty_log);
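
/*
 * Illustrative sketch (not part of kvm_main.c): how userspace retrieves the
 * snapshot produced by kvm_get_dirty_log().  The slot number and page count
 * are hypothetical; the buffer must provide one bit per page, rounded up to a
 * multiple of 64 bits as described in kvm_alloc_dirty_bitmap().
 */
#if 0	/* userspace snippet, shown here for illustration only */
#include <linux/kvm.h>
#include <sys/ioctl.h>
#include <stdlib.h>

static int example_read_dirty_log(int vm_fd, __u32 slot, __u64 npages)
{
	size_t bytes = ((npages + 63) / 64) * 8;	/* one bit per page */
	struct kvm_dirty_log log = { .slot = slot };
	int ret;

	log.dirty_bitmap = calloc(1, bytes);
	if (!log.dirty_bitmap)
		return -1;

	ret = ioctl(vm_fd, KVM_GET_DIRTY_LOG, &log);
	/* ... scan the bitmap for dirty pages ... */
	free(log.dirty_bitmap);
	return ret;
}
#endif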
#else /* CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT */
/**
 * kvm_get_dirty_log_protect - get a snapshot of dirty pages and reenable
 *	dirty page tracking for the corresponding pages.
 * @kvm:	pointer to kvm instance
 * @log:	slot id and address to which we copy the log
 *
 * We need to keep it in mind that VCPU threads can write to the bitmap
 * concurrently.  So, to avoid losing track of dirty pages we keep the
 * following order:
 *
 *    1. Take a snapshot of the bit and clear it if needed.
 *    2. Write protect the corresponding page.
 *    3. Copy the snapshot to the userspace.
 *    4. Upon return caller flushes TLB's if needed.
 *
 * Between 2 and 4, the guest may write to the page using the remaining TLB
 * entry.  This is not a problem because the page is reported dirty using
 * the snapshot taken before and step 4 ensures that writes done after
 * exiting to userspace will be logged for the next call.
 */
static int kvm_get_dirty_log_protect(struct kvm *kvm, struct kvm_dirty_log *log)
{
	struct kvm_memslots *slots;
	struct kvm_memory_slot *memslot;
	int i, as_id, id;
	unsigned long n;
	unsigned long *dirty_bitmap;
	unsigned long *dirty_bitmap_buffer;
	bool flush;
	/* Dirty ring tracking may be exclusive to dirty log tracking */
	if (!kvm_use_dirty_bitmap(kvm))
		return -ENXIO;

	as_id = log->slot >> 16;
	id = (u16)log->slot;
	if (as_id >= kvm_arch_nr_memslot_as_ids(kvm) || id >= KVM_USER_MEM_SLOTS)
		return -EINVAL;
n = kvm_dirty_bitmap_bytes(memslot);
	flush = false;
	if (kvm->manual_dirty_log_protect) {
		/*
		 * Unlike kvm_get_dirty_log, we always return false in *flush,
		 * because no flush is needed until KVM_CLEAR_DIRTY_LOG.  There
		 * is some code duplication between this function and
		 * kvm_get_dirty_log, but hopefully, once all architectures
		 * transition to kvm_get_dirty_log_protect, kvm_get_dirty_log
		 * can be eliminated.
		 */
dirty_bitmap_buffer = dirty_bitmap;
} else {
dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot);
memset(dirty_bitmap_buffer, 0, n);
		KVM_MMU_LOCK(kvm);
		for (i = 0; i < n / sizeof(long); i++) {
			unsigned long mask;
gfn_t offset;
if (flush)
kvm_flush_remote_tlbs_memslot(kvm, memslot);
	if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n))
		return -EFAULT;
	return 0;
}
/**
 * kvm_vm_ioctl_get_dirty_log - get and clear the log of dirty pages in a slot
 * @kvm:	kvm instance
 * @log:	slot id and address to which we copy the log
 *
 * Steps 1-4 below provide a general overview of dirty page logging.  See
 * kvm_get_dirty_log_protect() function description for additional details.
 *
 * We call kvm_get_dirty_log_protect() to handle steps 1-3, upon return we
 * always flush the TLB (step 4) even if the previous step failed and the dirty
 * bitmap may be corrupt.  Regardless of previous outcome the KVM logging API
 * does not preclude user space subsequent dirty log read.  Flushing TLB ensures
 * writes will be marked dirty for the next log read.
 */