staticstruct rb_root uprobes_tree = RB_ROOT; /* * allows us to skip the uprobe_mmap if there are no uprobe events active * at this time. Probably a fine grained per inode count is better?
*/ #define no_uprobe_events() RB_EMPTY_ROOT(&uprobes_tree)
/* Have a copy of original instruction */ #define UPROBE_COPY_INSN 0
struct uprobe {
	struct rb_node		rb_node;	/* node in the rb tree */
	refcount_t		ref;
	struct rw_semaphore	register_rwsem;
	struct rw_semaphore	consumer_rwsem;
	struct list_head	pending_list;
	struct list_head	consumers;
	struct inode		*inode;		/* Also hold a ref to inode */
	union {
		struct rcu_head		rcu;
		struct work_struct	work;
	};
	loff_t			offset;
	loff_t			ref_ctr_offset;
	unsigned long		flags;		/* "unsigned long" so bitops work */

	/*
	 * The generic code assumes that it has two members of unknown type
	 * owned by the arch-specific code:
	 *
	 *	insn -	copy_insn() saves the original instruction here for
	 *		arch_uprobe_analyze_insn().
	 *
	 *	ixol -	potentially modified instruction to execute out of
	 *		line, copied to xol_area by xol_get_insn_slot().
	 */
	struct arch_uprobe	arch;
};
/*
 * Execute out of line area: anonymous executable mapping installed
 * by the probed task to execute the copy of the original instruction
 * mangled by set_swbp().
 *
 * On a breakpoint hit, thread contests for a slot.  It frees the
 * slot after singlestep. Currently a fixed number of slots are
 * allocated.
 */
struct xol_area {
	wait_queue_head_t	wq;		/* if all slots are busy */
	unsigned long		*bitmap;	/* 0 = free slot */

	struct page		*page;
	/*
	 * We keep the vma's vm_start rather than a pointer to the vma
	 * itself.  The probed process or a naughty kernel module could make
	 * the vma go away, and we must handle that reasonably gracefully.
	 */
	unsigned long		vaddr;		/* Page(s) of instruction slots */
};
/* * valid_vma: Verify if the specified vma is an executable vma * Relax restrictions while unregistering: vm_flags might have * changed after breakpoint was inserted. * - is_register: indicates if we are in register context. * - Return 1 if the specified virtual address is in an * executable vma.
*/ staticbool valid_vma(struct vm_area_struct *vma, bool is_register)
{
vm_flags_t flags = VM_HUGETLB | VM_MAYEXEC | VM_MAYSHARE;
/**
 * is_swbp_insn - check if instruction is breakpoint instruction.
 * @insn: instruction to be checked.
 * Default implementation of is_swbp_insn
 * Returns true if @insn is a breakpoint instruction.
 */
bool __weak is_swbp_insn(uprobe_opcode_t *insn)
{
	/* Generic fallback: compare against the arch's software breakpoint. */
	return *insn == UPROBE_SWBP_INSN;
}
/**
 * is_trap_insn - check if instruction is breakpoint instruction.
 * @insn: instruction to be checked.
 * Default implementation of is_trap_insn
 * Returns true if @insn is a breakpoint instruction.
 *
 * This function is needed for the case where an architecture has multiple
 * trap instructions (like powerpc).
 */
bool __weak is_trap_insn(uprobe_opcode_t *insn)
{
	/* By default, the only trap variant recognized is the swbp insn. */
	return is_swbp_insn(insn);
}
/* * Note: We only check if the old_opcode is UPROBE_SWBP_INSN here. * We do not check if it is any other 'trap variant' which could * be conditional trap instruction such as the one powerpc supports. * * The logic is that we do not care if the underlying instruction * is a trap variant; uprobes always wins over any other (gdb) * breakpoint.
*/
copy_from_page(page, vaddr, &old_opcode, UPROBE_SWBP_INSN_SIZE);
is_swbp = is_swbp_insn(&old_opcode);
if (is_swbp_insn(new_opcode)) { if (is_swbp) /* register: already installed? */ return 0;
} else { if (!is_swbp) /* unregister: was it changed by us? */ return 0;
}
for_each_vma(vmi, tmp) if (valid_ref_ctr_vma(uprobe, tmp)) return tmp;
return NULL;
}
staticint
__update_ref_ctr(struct mm_struct *mm, unsignedlong vaddr, short d)
{ void *kaddr; struct page *page; int ret; short *ptr;
if (!vaddr || !d) return -EINVAL;
ret = get_user_pages_remote(mm, vaddr, 1,
FOLL_WRITE, &page, NULL); if (unlikely(ret <= 0)) { /* * We are asking for 1 page. If get_user_pages_remote() fails, * it may return 0, in that case we have to return error.
*/ return ret == 0 ? -EBUSY : ret;
}
/* Log a one-line diagnostic when an SDT reference counter update fails. */
static void update_ref_ctr_warn(struct uprobe *uprobe,
				struct mm_struct *mm, short d)
{
	pr_warn("ref_ctr %s failed for inode: 0x%lx offset: "
		"0x%llx ref_ctr_offset: 0x%llx of mm: 0x%p\n",
		d > 0 ? "increment" : "decrement", uprobe->inode->i_ino,
		(unsigned long long) uprobe->offset,
		(unsigned long long) uprobe->ref_ctr_offset, mm);
}
staticint update_ref_ctr(struct uprobe *uprobe, struct mm_struct *mm, short d)
{ struct vm_area_struct *rc_vma; unsignedlong rc_vaddr; int ret = 0;
rc_vma = find_ref_ctr_vma(uprobe, mm);
if (rc_vma) {
rc_vaddr = offset_to_vaddr(rc_vma, uprobe->ref_ctr_offset);
ret = __update_ref_ctr(mm, rc_vaddr, d); if (ret)
update_ref_ctr_warn(uprobe, mm, d);
if (d > 0) return ret;
}
mutex_lock(&delayed_uprobe_lock); if (d > 0)
ret = delayed_uprobe_add(uprobe, mm); else
delayed_uprobe_remove(uprobe, mm);
mutex_unlock(&delayed_uprobe_lock);
/* For now, we'll only handle PTE-mapped folios. */ if (fw->level != FW_LEVEL_PTE) return -EFAULT;
/* * See can_follow_write_pte(): we'd actually prefer a writable PTE here, * but the VMA might not be writable.
*/ if (!pte_write(fw->pte)) { if (!PageAnonExclusive(fw->page)) return -EFAULT; if (unlikely(userfaultfd_pte_wp(vma, fw->pte))) return -EFAULT; /* SOFTDIRTY is handled via pte_mkdirty() below. */
}
/* * We'll temporarily unmap the page and flush the TLB, such that we can * modify the page atomically.
*/
flush_cache_page(vma, vaddr, pte_pfn(fw->pte));
fw->pte = ptep_clear_flush(vma, vaddr, fw->ptep);
copy_to_page(fw->page, opcode_vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);
/* * When unregistering, we may only zap a PTE if uffd is disabled and * there are no unexpected folio references ...
*/ if (is_register || userfaultfd_missing(vma) ||
(folio_ref_count(folio) != folio_expected_ref_count(folio) + 1)) goto remap;
/* * ... and the mapped page is identical to the original page that * would get faulted in on next access.
*/ if (!orig_page_is_identical(vma, vaddr, fw->page, &pmd_mappable)) goto remap;
return pmd_mappable;
remap: /* * Make sure that our copy_to_page() changes become visible before the * set_pte_at() write.
*/
smp_wmb(); /* We modified the page. Make sure to mark the PTE dirty. */
set_pte_at(vma->vm_mm, vaddr, fw->ptep, pte_mkdirty(fw->pte)); return 0;
}
/*
 * NOTE:
 * Expect the breakpoint instruction to be the smallest size instruction for
 * the architecture. If an arch has variable length instruction and the
 * breakpoint instruction is not of the smallest length instruction
 * supported by that architecture then we need to modify is_trap_at_addr and
 * uprobe_write_opcode accordingly. This would never be a problem for archs
 * that have fixed length instructions.
 *
 * uprobe_write_opcode - write the opcode at a given virtual address.
 * @auprobe: arch specific probepoint information.
 * @vma: the probed virtual memory area.
 * @opcode_vaddr: the virtual address to store the opcode.
 * @opcode: opcode to be written at @opcode_vaddr.
 *
 * Called with mm->mmap_lock held for read or write.
 * Return 0 (success) or a negative errno.
 */
int uprobe_write_opcode(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
		const unsigned long opcode_vaddr, uprobe_opcode_t opcode)
{
	const unsigned long vaddr = opcode_vaddr & PAGE_MASK;
	struct mm_struct *mm = vma->vm_mm;
	struct uprobe *uprobe;
	int ret, is_register, ref_ctr_updated = 0;
	unsigned int gup_flags = FOLL_FORCE;
	struct mmu_notifier_range range;
	struct folio_walk fw;
	struct folio *folio;
	struct page *page;

	if (WARN_ON_ONCE(!is_cow_mapping(vma->vm_flags)))
		return -EINVAL;

	/*
	 * FIX: these two were read below without ever being set. Writing a
	 * swbp insn means we are registering; the containing uprobe carries
	 * the ref_ctr bookkeeping used on both success and failure paths.
	 */
	is_register = is_swbp_insn(&opcode);
	uprobe = container_of(auprobe, struct uprobe, arch);

	/*
	 * When registering, we have to break COW to get an exclusive anonymous
	 * page that we can safely modify. Use FOLL_WRITE to trigger a write
	 * fault if required. When unregistering, we might be lucky and the
	 * anon page is already gone. So defer write faults until really
	 * required. Use FOLL_SPLIT_PMD, because __uprobe_write_opcode()
	 * cannot deal with PMDs yet.
	 */
	if (is_register)
		gup_flags |= FOLL_WRITE | FOLL_SPLIT_PMD;

retry:
	ret = get_user_pages_remote(mm, vaddr, 1, gup_flags, &page, NULL);
	if (ret <= 0)
		goto out;
	folio = page_folio(page);

	ret = verify_opcode(page, opcode_vaddr, &opcode);
	if (ret <= 0) {
		folio_put(folio);
		goto out;
	}

	/* We are going to replace instruction, update ref_ctr. */
	if (!ref_ctr_updated && uprobe->ref_ctr_offset) {
		ret = update_ref_ctr(uprobe, mm, is_register ? 1 : -1);
		if (ret) {
			folio_put(folio);
			goto out;
		}

		ref_ctr_updated = 1;
	}

	ret = 0;
	if (unlikely(!folio_test_anon(folio) || folio_is_zone_device(folio))) {
		VM_WARN_ON_ONCE(is_register);
		folio_put(folio);
		goto out;
	}

	if (!is_register) {
		/*
		 * In the common case, we'll be able to zap the page when
		 * unregistering. So trigger MMU notifiers now, as we won't
		 * be able to do it under PTL.
		 */
		mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
					vaddr, vaddr + PAGE_SIZE);
		mmu_notifier_invalidate_range_start(&range);
	}

	ret = -EAGAIN;
	/* Walk the page tables again, to perform the actual update. */
	if (folio_walk_start(&fw, vma, vaddr, 0)) {
		if (fw.page == page)
			ret = __uprobe_write_opcode(vma, &fw, folio, opcode_vaddr, opcode);
		folio_walk_end(&fw, vma);
	}

	if (!is_register)
		mmu_notifier_invalidate_range_end(&range);

	folio_put(folio);
	switch (ret) {
	case -EFAULT:
		gup_flags |= FOLL_WRITE | FOLL_SPLIT_PMD;
		fallthrough;
	case -EAGAIN:
		goto retry;
	default:
		break;
	}

out:
	/* Revert back reference counter if instruction update failed. */
	if (ret < 0 && ref_ctr_updated)
		update_ref_ctr(uprobe, mm, is_register ? -1 : 1);

	/* try collapse pmd for compound page */
	if (ret > 0)
		collapse_pte_mapped_thp(mm, vaddr, false);

	return ret < 0 ? ret : 0;
}
/**
 * set_swbp - store breakpoint at a given address.
 * @auprobe: arch specific probepoint information.
 * @vma: the probed virtual memory area.
 * @vaddr: the virtual address to insert the opcode.
 *
 * For mm @mm, store the breakpoint instruction at @vaddr.
 * Return 0 (success) or a negative errno.
 */
int __weak set_swbp(struct arch_uprobe *auprobe, struct vm_area_struct *vma,
		unsigned long vaddr)
{
	return uprobe_write_opcode(auprobe, vma, vaddr, UPROBE_SWBP_INSN);
}
/**
 * set_orig_insn - Restore the original instruction.
 * @vma: the probed virtual memory area.
 * @auprobe: arch specific probepoint information.
 * @vaddr: the virtual address to insert the opcode.
 *
 * For mm @mm, restore the original opcode (opcode) at @vaddr.
 * Return 0 (success) or a negative errno.
 */
int __weak set_orig_insn(struct arch_uprobe *auprobe,
		struct vm_area_struct *vma, unsigned long vaddr)
{
	/* The saved copy of the probed instruction lives in auprobe->insn. */
	return uprobe_write_opcode(auprobe, vma, vaddr,
			*(uprobe_opcode_t *)&auprobe->insn);
}
/* uprobe should have guaranteed positive refcount */ staticstruct uprobe *get_uprobe(struct uprobe *uprobe)
{
refcount_inc(&uprobe->ref); return uprobe;
}
/* * uprobe should have guaranteed lifetime, which can be either of: * - caller already has refcount taken (and wants an extra one); * - uprobe is RCU protected and won't be freed until after grace period; * - we are holding uprobes_treelock (for read or write, doesn't matter).
*/ staticstruct uprobe *try_get_uprobe(struct uprobe *uprobe)
{ if (refcount_inc_not_zero(&uprobe->ref)) return uprobe; return NULL;
}
if (uprobe_is_active(uprobe)) {
write_seqcount_begin(&uprobes_seqcount);
rb_erase(&uprobe->rb_node, &uprobes_tree);
write_seqcount_end(&uprobes_seqcount);
}
write_unlock(&uprobes_treelock);
/* * If application munmap(exec_vma) before uprobe_unregister() * gets called, we don't get a chance to remove uprobe from * delayed_uprobe_list from remove_breakpoint(). Do it here.
*/
mutex_lock(&delayed_uprobe_lock);
delayed_uprobe_remove(uprobe, NULL);
mutex_unlock(&delayed_uprobe_lock);
/* * hprobe_consume() fetches hprobe's underlying uprobe and detects whether * uprobe is SRCU protected or is refcounted. hprobe_consume() can be * used only once for a given hprobe. * * Caller has to call hprobe_finalize() and pass previous hprobe_state, so * that hprobe_finalize() can perform SRCU unlock or put uprobe, whichever * is appropriate.
*/ staticinlinestruct uprobe *hprobe_consume(struct hprobe *hprobe, enum hprobe_state *hstate)
{
*hstate = xchg(&hprobe->state, HPROBE_CONSUMED); switch (*hstate) { case HPROBE_LEASED: case HPROBE_STABLE: return hprobe->uprobe; case HPROBE_GONE: /* uprobe is NULL, no SRCU */ case HPROBE_CONSUMED: /* uprobe was finalized already, do nothing */ return NULL; default:
WARN(1, "hprobe invalid state %d", *hstate); return NULL;
}
}
/*
 * Reset hprobe state and, if hprobe was LEASED, release SRCU lock.
 * hprobe_finalize() can only be used from current context after
 * hprobe_consume() call (which determines uprobe and hstate value).
 */
static void hprobe_finalize(struct hprobe *hprobe, enum hprobe_state hstate)
{
	switch (hstate) {
	case HPROBE_LEASED:
		/* The lease held an SRCU read-side critical section; drop it. */
		__srcu_read_unlock(&uretprobes_srcu, hprobe->srcu_idx);
		break;
	case HPROBE_STABLE:
		/* Stable state owns a refcount instead; release it. */
		put_uprobe(hprobe->uprobe);
		break;
	case HPROBE_GONE:
	case HPROBE_CONSUMED:
		/* Nothing to release. */
		break;
	default:
		WARN(1, "hprobe invalid state %d", hstate);
		break;
	}
}
/* * Attempt to switch (atomically) uprobe from being SRCU protected (LEASED) * to refcounted (STABLE) state. Competes with hprobe_consume(); only one of * them can win the race to perform SRCU unlocking. Whoever wins must perform * SRCU unlock. * * Returns underlying valid uprobe or NULL, if there was no underlying uprobe * to begin with or we failed to bump its refcount and it's going away. * * Returned non-NULL uprobe can be still safely used within an ongoing SRCU * locked region. If `get` is true, it's guaranteed that non-NULL uprobe has * an extra refcount for caller to assume and use. Otherwise, it's not * guaranteed that returned uprobe has a positive refcount, so caller has to * attempt try_get_uprobe(), if it needs to preserve uprobe beyond current * SRCU lock region. See dup_utask().
*/ staticstruct uprobe *hprobe_expire(struct hprobe *hprobe, bool get)
{ enum hprobe_state hstate;
/* * Caller should guarantee that return_instance is not going to be * freed from under us. This can be achieved either through holding * rcu_read_lock() or by owning return_instance in the first place. * * Underlying uprobe is itself protected from reuse by SRCU, so ensure * SRCU lock is held properly.
*/
lockdep_assert(srcu_read_lock_held(&uretprobes_srcu));
hstate = READ_ONCE(hprobe->state); switch (hstate) { case HPROBE_STABLE: /* uprobe has positive refcount, bump refcount, if necessary */ return get ? get_uprobe(hprobe->uprobe) : hprobe->uprobe; case HPROBE_GONE: /* * SRCU was unlocked earlier and we didn't manage to take * uprobe refcnt, so it's effectively NULL
*/ return NULL; case HPROBE_CONSUMED: /* * uprobe was consumed, so it's effectively NULL as far as * uretprobe processing logic is concerned
*/ return NULL; case HPROBE_LEASED: { struct uprobe *uprobe = try_get_uprobe(hprobe->uprobe); /* * Try to switch hprobe state, guarding against * hprobe_consume() or another hprobe_expire() racing with us. * Note, if we failed to get uprobe refcount, we use special * HPROBE_GONE state to signal that hprobe->uprobe shouldn't * be used as it will be freed after SRCU is unlocked.
*/ if (try_cmpxchg(&hprobe->state, &hstate, uprobe ? HPROBE_STABLE : HPROBE_GONE)) { /* We won the race, we are the ones to unlock SRCU */
__srcu_read_unlock(&uretprobes_srcu, hprobe->srcu_idx); return get ? get_uprobe(uprobe) : uprobe;
}
/* * We lost the race, undo refcount bump (if it ever happened), * unless caller would like an extra refcount anyways.
*/ if (uprobe && !get)
put_uprobe(uprobe); /* * Even if hprobe_consume() or another hprobe_expire() wins * the state update race and unlocks SRCU from under us, we * still have a guarantee that underyling uprobe won't be * freed due to ongoing caller's SRCU lock region, so we can * return it regardless. Also, if `get` was true, we also have * an extra ref for the caller to own. This is used in dup_utask().
*/ return uprobe;
} default:
WARN(1, "unknown hprobe state %d", hstate); return NULL;
}
}
static __always_inline int uprobe_cmp(conststruct inode *l_inode, const loff_t l_offset, conststruct uprobe *r)
{ if (l_inode < r->inode) return -1;
/* * Assumes being inside RCU protected region. * No refcount is taken on returned uprobe.
*/ staticstruct uprobe *find_uprobe_rcu(struct inode *inode, loff_t offset)
{ struct __uprobe_key key = {
.inode = inode,
.offset = offset,
}; struct rb_node *node; unsignedint seq;
lockdep_assert(rcu_read_lock_trace_held());
do {
seq = read_seqcount_begin(&uprobes_seqcount);
node = rb_find_rcu(&key, &uprobes_tree, __uprobe_cmp_key); /* * Lockless RB-tree lookups can result only in false negatives. * If the element is found, it is correct and can be returned * under RCU protection. If we find nothing, we need to * validate that seqcount didn't change. If it did, we have to * try again as we might have missed the element (false * negative). If seqcount is unchanged, search truly failed.
*/ if (node) return __node_2_uprobe(node);
} while (read_seqcount_retry(&uprobes_seqcount, seq));
return NULL;
}
/* * Attempt to insert a new uprobe into uprobes_tree. * * If uprobe already exists (for given inode+offset), we just increment * refcount of previously existing uprobe. * * If not, a provided new instance of uprobe is inserted into the tree (with * assumed initial refcount == 1). * * In any case, we return a uprobe instance that ends up being in uprobes_tree. * Caller has to clean up new uprobe instance, if it ended up not being * inserted into the tree. * * We assume that uprobes_treelock is held for writing.
*/ staticstruct uprobe *__insert_uprobe(struct uprobe *uprobe)
{ struct rb_node *node;
again:
node = rb_find_add_rcu(&uprobe->rb_node, &uprobes_tree, __uprobe_cmp); if (node) { struct uprobe *u = __node_2_uprobe(node);
if (!try_get_uprobe(u)) {
rb_erase(node, &uprobes_tree);
RB_CLEAR_NODE(&u->rb_node); goto again;
}
return u;
}
return uprobe;
}
/* * Acquire uprobes_treelock and insert uprobe into uprobes_tree * (or reuse existing one, see __insert_uprobe() comments above).
*/ staticstruct uprobe *insert_uprobe(struct uprobe *uprobe)
{ struct uprobe *u;
write_lock(&uprobes_treelock);
write_seqcount_begin(&uprobes_seqcount);
u = __insert_uprobe(uprobe);
write_seqcount_end(&uprobes_seqcount);
write_unlock(&uprobes_treelock);
/*
 * For uprobe @uprobe, delete the consumer @uc.
 * Should never be called with consumer that's not part of @uprobe->consumers.
 */
static void consumer_del(struct uprobe *uprobe, struct uprobe_consumer *uc)
{
	down_write(&uprobe->consumer_rwsem);
	list_del_rcu(&uc->cons_node);
	up_write(&uprobe->consumer_rwsem);
}
staticint __copy_insn(struct address_space *mapping, struct file *filp, void *insn, int nbytes, loff_t offset)
{ struct page *page; /* * Ensure that the page that has the original instruction is populated * and in page-cache. If ->read_folio == NULL it must be shmem_mapping(), * see uprobe_register().
*/ if (mapping->a_ops->read_folio)
page = read_mapping_page(mapping, offset >> PAGE_SHIFT, filp); else
page = shmem_read_mapping_page(mapping, offset >> PAGE_SHIFT); if (IS_ERR(page)) return PTR_ERR(page);
staticint prepare_uprobe(struct uprobe *uprobe, struct file *file, struct mm_struct *mm, unsignedlong vaddr)
{ int ret = 0;
if (test_bit(UPROBE_COPY_INSN, &uprobe->flags)) return ret;
/* TODO: move this into _register, until then we abuse this sem. */
down_write(&uprobe->consumer_rwsem); if (test_bit(UPROBE_COPY_INSN, &uprobe->flags)) goto out;
ret = copy_insn(uprobe, file); if (ret) goto out;
ret = -ENOTSUPP; if (is_trap_insn((uprobe_opcode_t *)&uprobe->arch.insn)) goto out;
ret = arch_uprobe_analyze_insn(&uprobe->arch, mm, vaddr); if (ret) goto out;
smp_wmb(); /* pairs with the smp_rmb() in handle_swbp() */
set_bit(UPROBE_COPY_INSN, &uprobe->flags);
ret = prepare_uprobe(uprobe, vma->vm_file, mm, vaddr); if (ret) return ret;
/* * set MMF_HAS_UPROBES in advance for uprobe_pre_sstep_notifier(), * the task can hit this breakpoint right after __replace_page().
*/
first_uprobe = !test_bit(MMF_HAS_UPROBES, &mm->flags); if (first_uprobe)
set_bit(MMF_HAS_UPROBES, &mm->flags);
ret = set_swbp(&uprobe->arch, vma, vaddr); if (!ret)
clear_bit(MMF_RECALC_UPROBES, &mm->flags); elseif (first_uprobe)
clear_bit(MMF_HAS_UPROBES, &mm->flags);
if (!prev && !more) { /* * Needs GFP_NOWAIT to avoid i_mmap_rwsem recursion through * reclaim. This is optimistic, no harm done if it fails.
*/
prev = kmalloc(sizeof(struct map_info),
GFP_NOWAIT | __GFP_NOMEMALLOC | __GFP_NOWARN); if (prev)
prev->next = NULL;
} if (!prev) {
more++; continue;
}
if (err && is_register) goto free; /* * We take mmap_lock for writing to avoid the race with * find_active_uprobe_rcu() which takes mmap_lock for reading. * Thus this install_breakpoint() can not make * is_trap_at_addr() true right after find_uprobe_rcu() * returns NULL in find_active_uprobe_rcu().
*/
mmap_write_lock(mm); if (check_stable_address_space(mm)) goto unlock;
void uprobe_unregister_sync(void)
{
	/*
	 * Now that handler_chain() and handle_uretprobe_chain() iterate over
	 * uprobe->consumers list under RCU protection without holding
	 * uprobe->register_rwsem, we need to wait for RCU grace period to
	 * make sure that we can't call into just unregistered
	 * uprobe_consumer's callbacks anymore. If we don't do that, fast and
	 * unlucky enough caller can free consumer's memory and cause
	 * handler_chain() or handle_uretprobe_chain() to do an use-after-free.
	 */
	synchronize_rcu_tasks_trace();

	/* Uretprobe consumers are additionally protected by SRCU. */
	synchronize_srcu(&uretprobes_srcu);
}
EXPORT_SYMBOL_GPL(uprobe_unregister_sync);
/** * uprobe_register - register a probe * @inode: the file in which the probe has to be placed. * @offset: offset from the start of the file. * @ref_ctr_offset: offset of SDT marker / reference counter * @uc: information on howto handle the probe.. * * Apart from the access refcount, uprobe_register() takes a creation * refcount (thro alloc_uprobe) if and only if this @uprobe is getting * inserted into the rbtree (i.e first consumer for a @inode:@offset * tuple). Creation refcount stops uprobe_unregister from freeing the * @uprobe even before the register operation is complete. Creation * refcount is released when the last @uc for the @uprobe * unregisters. Caller of uprobe_register() is required to keep @inode * (and the containing mount) referenced. * * Return: pointer to the new uprobe on success or an ERR_PTR on failure.
*/ struct uprobe *uprobe_register(struct inode *inode,
loff_t offset, loff_t ref_ctr_offset, struct uprobe_consumer *uc)
{ struct uprobe *uprobe; int ret;
/* Uprobe must have at least one set consumer */ if (!uc->handler && !uc->ret_handler) return ERR_PTR(-EINVAL);
/* copy_insn() uses read_mapping_page() or shmem_read_mapping_page() */ if (!inode->i_mapping->a_ops->read_folio &&
!shmem_mapping(inode->i_mapping)) return ERR_PTR(-EIO); /* Racy, just to catch the obvious mistakes */ if (offset > i_size_read(inode)) return ERR_PTR(-EINVAL);
/* * This ensures that copy_from_page(), copy_to_page() and * __update_ref_ctr() can't cross page boundary.
*/ if (!IS_ALIGNED(offset, UPROBE_SWBP_INSN_SIZE)) return ERR_PTR(-EINVAL); if (!IS_ALIGNED(ref_ctr_offset, sizeof(short))) return ERR_PTR(-EINVAL);
uprobe = alloc_uprobe(inode, offset, ref_ctr_offset); if (IS_ERR(uprobe)) return uprobe;
down_write(&uprobe->register_rwsem);
consumer_add(uprobe, uc);
ret = register_for_each_vma(uprobe, uc);
up_write(&uprobe->register_rwsem);
if (ret) {
uprobe_unregister_nosync(uprobe, uc); /* * Registration might have partially succeeded, so we can have * this consumer being called right at this time. We need to * sync here. It's ok, it's unlikely slow path.
*/
uprobe_unregister_sync(); return ERR_PTR(ret);
}
/** * uprobe_apply - add or remove the breakpoints according to @uc->filter * @uprobe: uprobe which "owns" the breakpoint * @uc: consumer which wants to add more or remove some breakpoints * @add: add or remove the breakpoints * Return: 0 on success or negative error code.
*/ int uprobe_apply(struct uprobe *uprobe, struct uprobe_consumer *uc, bool add)
{ struct uprobe_consumer *con; int ret = -ENOENT;
if (inode < u->inode) {
n = n->rb_left;
} elseif (inode > u->inode) {
n = n->rb_right;
} else { if (max < u->offset)
n = n->rb_left; elseif (min > u->offset)
n = n->rb_right; else break;
}
}
return n;
}
/* * For a given range in vma, build a list of probes that need to be inserted.
*/ staticvoid build_probe_list(struct inode *inode, struct vm_area_struct *vma, unsignedlong start, unsignedlong end, struct list_head *head)
{
loff_t min, max; struct rb_node *n, *t; struct uprobe *u;
INIT_LIST_HEAD(head);
min = vaddr_to_offset(vma, start);
max = min + (end - start) - 1;
read_lock(&uprobes_treelock);
n = find_node_in_range(inode, min, max); if (n) { for (t = n; t; t = rb_prev(t)) {
u = rb_entry(t, struct uprobe, rb_node); if (u->inode != inode || u->offset < min) break; /* if uprobe went away, it's safe to ignore it */ if (try_get_uprobe(u))
list_add(&u->pending_list, head);
} for (t = n; (t = rb_next(t)); ) {
u = rb_entry(t, struct uprobe, rb_node); if (u->inode != inode || u->offset > max) break; /* if uprobe went away, it's safe to ignore it */ if (try_get_uprobe(u))
list_add(&u->pending_list, head);
}
}
read_unlock(&uprobes_treelock);
}
/* @vma contains reference counter, not the probed instruction. */ staticint delayed_ref_ctr_inc(struct vm_area_struct *vma)
{ struct list_head *pos, *q; struct delayed_uprobe *du; unsignedlong vaddr; int ret = 0, err = 0;
mutex_lock(&delayed_uprobe_lock);
list_for_each_safe(pos, q, &delayed_uprobe_list) {
du = list_entry(pos, struct delayed_uprobe, list);
if (du->mm != vma->vm_mm ||
!valid_ref_ctr_vma(du->uprobe, vma)) continue;
vaddr = offset_to_vaddr(vma, du->uprobe->ref_ctr_offset);
ret = __update_ref_ctr(vma->vm_mm, vaddr, 1); if (ret) {
update_ref_ctr_warn(du->uprobe, vma->vm_mm, 1); if (!err)
err = ret;
}
delayed_uprobe_delete(du);
}
mutex_unlock(&delayed_uprobe_lock); return err;
}
/* * Called from mmap_region/vma_merge with mm->mmap_lock acquired. * * Currently we ignore all errors and always return 0, the callers * can't handle the failure anyway.
*/ int uprobe_mmap(struct vm_area_struct *vma)
{ struct list_head tmp_list; struct uprobe *uprobe, *u; struct inode *inode;
inode = file_inode(vma->vm_file); if (!inode) return 0;
mutex_lock(uprobes_mmap_hash(inode));
build_probe_list(inode, vma, vma->vm_start, vma->vm_end, &tmp_list); /* * We can race with uprobe_unregister(), this uprobe can be already * removed. But in this case filter_chain() must return false, all * consumers have gone away.
*/
list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) { if (!fatal_signal_pending(current) &&
filter_chain(uprobe, vma->vm_mm)) { unsignedlong vaddr = offset_to_vaddr(vma, uprobe->offset);
install_breakpoint(uprobe, vma, vaddr);
}
put_uprobe(uprobe);
}
mutex_unlock(uprobes_mmap_hash(inode));
min = vaddr_to_offset(vma, start);
max = min + (end - start) - 1;
read_lock(&uprobes_treelock);
n = find_node_in_range(inode, min, max);
read_unlock(&uprobes_treelock);
return !!n;
}
/* * Called in context of a munmap of a vma.
*/ void uprobe_munmap(struct vm_area_struct *vma, unsignedlong start, unsignedlong end)
{ if (no_uprobe_events() || !valid_vma(vma, false)) return;
if (!atomic_read(&vma->vm_mm->mm_users)) /* called by mmput() ? */ return;
if (!test_bit(MMF_HAS_UPROBES, &vma->vm_mm->flags) ||
test_bit(MMF_RECALC_UPROBES, &vma->vm_mm->flags)) return;
if (vma_has_uprobes(vma, start, end))
set_bit(MMF_RECALC_UPROBES, &vma->vm_mm->flags);
}
/* Slot allocation for XOL */ staticint xol_add_vma(struct mm_struct *mm, struct xol_area *area)
{ struct vm_area_struct *vma; int ret;
if (mmap_write_lock_killable(mm)) return -EINTR;
if (mm->uprobes_state.xol_area) {
ret = -EALREADY; goto fail;
}
if (!area->vaddr) { /* Try to map as high as possible, this is only a hint. */
area->vaddr = get_unmapped_area(NULL, TASK_SIZE - PAGE_SIZE,
PAGE_SIZE, 0, 0); if (IS_ERR_VALUE(area->vaddr)) {
ret = area->vaddr; goto fail;
}
}
vma = _install_special_mapping(mm, area->vaddr, PAGE_SIZE,
VM_EXEC|VM_MAYEXEC|VM_DONTCOPY|VM_IO|
VM_SEALED_SYSMAP,
&xol_mapping); if (IS_ERR(vma)) {
ret = PTR_ERR(vma); goto fail;
}
ret = 0; /* pairs with get_xol_area() */
smp_store_release(&mm->uprobes_state.xol_area, area); /* ^^^ */
fail:
mmap_write_unlock(mm);
/* * get_xol_area - Allocate process's xol_area if necessary. * This area will be used for storing instructions for execution out of line. * * Returns the allocated area or NULL.
*/ staticstruct xol_area *get_xol_area(void)
{ struct mm_struct *mm = current->mm; struct xol_area *area;
if (!mm->uprobes_state.xol_area)
__create_xol_area(0);
/* Pairs with xol_add_vma() smp_store_release() */
area = READ_ONCE(mm->uprobes_state.xol_area); /* ^^^ */ return area;
}
/* * uprobe_clear_state - Free the area allocated for slots.
*/ void uprobe_clear_state(struct mm_struct *mm)
{ struct xol_area *area = mm->uprobes_state.xol_area;
/* * We probably need flush_icache_user_page() but it needs vma. * This should work on most of architectures by default. If * architecture needs to do something different it can define * its own version of the function.
*/
flush_dcache_page(page);
}
/**
 * uprobe_get_swbp_addr - compute address of swbp given post-swbp regs
 * @regs: Reflects the saved state of the task after it has hit a breakpoint
 * instruction.
 * Return the address of the breakpoint instruction.
 */
unsigned long __weak uprobe_get_swbp_addr(struct pt_regs *regs)
{
	/* The IP has already advanced past the breakpoint; step it back. */
	return instruction_pointer(regs) - UPROBE_SWBP_INSN_SIZE;
}
/* * At this point return_instance is unlinked from utask's * return_instances list and this has become visible to ri_timer(). * If seqcount now indicates that ri_timer's return instance * processing loop isn't active, we can return ri into the pool of * to-be-reused return instances for future uretprobes. If ri_timer() * happens to be running right now, though, we fallback to safety and * just perform RCU-delated freeing of ri. * Admittedly, this is a rather simple use of seqcount, but it nicely * abstracts away all the necessary memory barriers, so we use * a well-supported kernel primitive here.
*/ if (raw_seqcount_try_begin(&utask->ri_seqcount, seq)) { /* immediate reuse of ri without RCU GP is OK */
ri_pool_push(utask, ri);
} else { /* we might be racing with ri_timer(), so play it safe */
ri_free(ri);
}
}
/* * Called with no locks held. * Called in context of an exiting or an exec-ing thread.
*/ void uprobe_free_utask(struct task_struct *t)
{ struct uprobe_task *utask = t->utask; struct return_instance *ri, *ri_next;
ri = utask->return_instances; while (ri) {
ri_next = ri->next;
free_ret_instance(utask, ri, true/* cleanup_hprobe */);
ri = ri_next;
}
/* free_ret_instance() above might add to ri_pool, so this loop should come last */
ri = utask->ri_pool; while (ri) {
ri_next = ri->next;
ri_free(ri);
ri = ri_next;
}
/* SRCU protects uprobe from reuse for the cmpxchg() inside hprobe_expire(). */
guard(srcu)(&uretprobes_srcu); /* RCU protects return_instance from freeing. */
guard(rcu)();
/* * See free_ret_instance() for notes on seqcount use. * We also employ raw API variants to avoid lockdep false-positive * warning complaining about enabled preemption. The timer can only be * invoked once for a uprobe_task. Therefore there can only be one * writer. The reader does not require an even sequence count to make * progress, so it is OK to remain preemptible on PREEMPT_RT.
*/
raw_write_seqcount_begin(&utask->ri_seqcount);
/* * Allocate a uprobe_task object for the task if necessary. * Called when the thread hits a breakpoint. * * Returns: * - pointer to new uprobe_task on success * - NULL otherwise
*/ staticstruct uprobe_task *get_utask(void)
{ if (!current->utask)
current->utask = alloc_utask(); return current->utask;
}
/* NOTE(review): fragment -- the dup_utask() signature and its local
 * declarations (n_utask, o_utask, p, o, n, uprobe) are missing from this
 * copy; the body below matches a dup_utask()-style deep copy of the
 * parent's return_instance chain -- TODO confirm against upstream. */
n_utask = alloc_utask(); if (!n_utask) return -ENOMEM;
t->utask = n_utask;
/* protect uprobes from freeing, we'll need try_get_uprobe() them */
guard(srcu)(&uretprobes_srcu);
/* walk the old chain, appending a duplicate of each node via tail pointer p */
p = &n_utask->return_instances; for (o = o_utask->return_instances; o; o = o->next) {
n = dup_return_instance(o); if (!n) return -ENOMEM;
/* if uprobe is non-NULL, we'll have an extra refcount for uprobe */
uprobe = hprobe_expire(&o->hprobe, true);
/* * New utask will have stable properly refcounted uprobe or * NULL. Even if we failed to get refcounted uprobe, we still * need to preserve full set of return_instances for proper * uretprobe handling and nesting in forked task.
*/
hprobe_init_stable(&n->hprobe, uprobe);
n->next = NULL;
/* publish the fully-initialized node before advancing the tail pointer */
rcu_assign_pointer(*p, n);
p = &n->next;
n_utask->depth++;
}
return 0;
}
staticvoid dup_xol_work(struct callback_head *work)
{ if (current->flags & PF_EXITING) return;
if (!__create_xol_area(current->utask->dup_xol_addr) &&
!fatal_signal_pending(current))
uprobe_warn(current, "dup xol area");
}
/* * Called in context of a new clone/fork from copy_process.
 * Duplicates the parent's uretprobe return_instance state into the
 * child when the child gets its own mm (or on vfork).
*/ void uprobe_copy_process(struct task_struct *t, unsignedlong flags)
{ struct uprobe_task *utask = current->utask; struct mm_struct *mm = current->mm; struct xol_area *area;
t->utask = NULL;
/* nothing to copy if the parent has no pending return instances */
if (!utask || !utask->return_instances) return;
/* threads sharing the mm (and not vfork'ed) need no private copy */
if (mm == t->mm && !(flags & CLONE_VFORK)) return;
if (dup_utask(t, utask)) return uprobe_warn(t, "dup ret instances");
/* The task can fork() after dup_xol_work() fails */
area = mm->uprobes_state.xol_area; if (!area) return uprobe_warn(t, "dup xol area");
/* NOTE(review): function appears truncated here -- the part that
 * schedules dup_xol_work() (and this function's closing brace) is
 * missing from this copy; TODO restore from upstream. */
/*
 * Current area->vaddr notion assume the trampoline address is always
 * equal area->vaddr.
 *
 * Returns UPROBE_NO_TRAMPOLINE_VADDR (-1) in case the xol_area is not
 * allocated.
 */
unsigned long uprobe_get_trampoline_vaddr(void)
{
	unsigned long trampoline_vaddr = UPROBE_NO_TRAMPOLINE_VADDR;
	struct xol_area *area;

	/* Pairs with xol_add_vma() smp_store_release() */
	area = READ_ONCE(current->mm->uprobes_state.xol_area);
	if (area)
		trampoline_vaddr = area->vaddr;

	/*
	 * Fix: the function was falling off the end of a non-void
	 * function (the return statement and closing brace were missing),
	 * which is undefined behavior. Return the computed address, which
	 * is the value the local was initialized and conditionally
	 * updated for.
	 */
	return trampoline_vaddr;
}
/* NOTE(review): fragment -- no function header visible; this reads like
 * the middle of the uretprobe return-address bookkeeping (the locals
 * chained, orig_ret_vaddr, trampoline_vaddr, utask, regs, srcu_idx and
 * the "free" label are declared in the missing part) -- TODO confirm
 * against upstream. */
/* drop the entries invalidated by longjmp() */
chained = (orig_ret_vaddr == trampoline_vaddr);
cleanup_return_instances(utask, chained, regs);
/* * We don't want to keep trampoline address in stack, rather keep the * original return address of first caller thru all the consequent * instances. This also makes breakpoint unwrapping easier.
*/ if (chained) { if (!utask->return_instances) { /* * This situation is not possible. Likely we have an * attack from user-space.
*/
uprobe_warn(current, "handle tail call"); goto free;
}
/* reuse the outermost caller's original return address */
orig_ret_vaddr = utask->return_instances->orig_ret_vaddr;
}
/* __srcu_read_lock() because SRCU lock survives switch to user space */
srcu_idx = __srcu_read_lock(&uretprobes_srcu);
/* * If we are singlestepping, then ensure this thread is not connected to * non-fatal signals until completion of singlestep. When xol insn itself * triggers the signal, restart the original insn even if the task is * already SIGKILL'ed (since coredump should report the correct ip). This * is even more important if the task has a handler for SIGSEGV/etc, The * _same_ instruction should be repeated again after return from the signal * handler, and SSTEP can never finish in this case.
*/ bool uprobe_deny_signal(void)
{ struct task_struct *t = current; struct uprobe_task *utask = t->utask;
if (likely(!utask || !utask->active_uprobe)) returnfalse;
WARN_ON_ONCE(utask->state != UTASK_SSTEP);
if (task_sigpending(t)) {
utask->signal_denied = true;
clear_tsk_thread_flag(t, TIF_SIGPENDING);
/* NOTE(review): fragment -- looks like the vma scan of an
 * mmf_recalc_uprobes()-style function: return early while any executable
 * vma still contains uprobes; the function header (and whatever follows
 * the loop) is missing from this copy -- TODO confirm upstream. */
for_each_vma(vmi, vma) { if (!valid_vma(vma, false)) continue; /* * This is not strictly accurate, we can race with * uprobe_unregister() and see the already removed * uprobe if delete_uprobe() was not yet called. * Or this uprobe can be filtered out.
*/ if (vma_has_uprobes(vma, vma->vm_start, vma->vm_end)) return;
}
/* NOTE(review): fragment -- tail of what appears to be a trap-opcode
 * probe at a user address (is_trap_at_addr()-style); the signature and
 * the declarations of vaddr/opcode/result/page are in the missing part
 * of this copy -- TODO confirm upstream. */
if (WARN_ON_ONCE(!IS_ALIGNED(vaddr, UPROBE_SWBP_INSN_SIZE))) return -EINVAL;
/* fast path: try to read the opcode directly without faulting */
pagefault_disable();
result = __get_user(opcode, (uprobe_opcode_t __user *)vaddr);
pagefault_enable();
if (likely(result == 0)) goto out;
/* slow path: fault the page in and copy the opcode out of it */
result = get_user_pages(vaddr, 1, FOLL_FORCE, &page); if (result < 0) return result;
copy_from_page(page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);
put_page(page);
out: /* This needs to return true for any variant of the trap insn */ return is_trap_insn(&opcode);
}
/* NOTE(review): fragment -- appears to be the start of a speculative
 * (lockless) uprobe lookup guarded by mmap_lock_speculate_try_begin();
 * the function header and the remainder of the body are missing from
 * this copy -- TODO confirm upstream. */
if (!mmap_lock_speculate_try_begin(mm, &seq)) return NULL;
vma = vma_lookup(mm, bp_vaddr); if (!vma) return NULL;
/* * vm_file memory can be reused for another instance of struct file, * but can't be freed from under us, so it's safe to read fields from * it, even if the values are some garbage values; ultimately * find_uprobe_rcu() + mmap_lock_speculation_end() check will ensure * that whatever we speculatively found is correct
*/
vm_file = READ_ONCE(vma->vm_file); if (!vm_file) return NULL;
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit noch Richtigkeit
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.