/* * #!-checking implemented by tytso.
*/ /* * Demand-loading implemented 01.12.91 - no need to read anything but * the header into memory. The inode of the executable is put into * "current->executable", and page faults do the actual loading. Clean. * * Once more I can proudly say that linux stood up to being changed: it * was less than 2 hours work to get demand-loading completely implemented. * * Demand loading changed July 1993 by Eric Youngdale. Use mmap instead, * current->executable is only used by the procfs. This allows a dispatch * table to check for several different types of binary formats. We keep * trying until we recognize the file or we run out of supported binary * formats.
*/
bool path_noexec(conststruct path *path)
{ /* If it's an anonymous inode make sure that we catch any shenanigans. */
VFS_WARN_ON_ONCE(IS_ANON_FILE(d_inode(path->dentry)) &&
!(path->mnt->mnt_sb->s_iflags & SB_I_NOEXEC)); return (path->mnt->mnt_flags & MNT_NOEXEC) ||
(path->mnt->mnt_sb->s_iflags & SB_I_NOEXEC);
}
#ifdef CONFIG_MMU /* * The nascent bprm->mm is not visible until exec_mmap() but it can * use a lot of memory, account these pages in current->mm temporary * for oom_badness()->get_mm_rss(). Once exec succeeds or fails, we * change the counter back via acct_arg_size(0).
*/ staticvoid acct_arg_size(struct linux_binprm *bprm, unsignedlong pages)
{ struct mm_struct *mm = current->mm; long diff = (long)(pages - bprm->vma_pages);
/* * Avoid relying on expanding the stack down in GUP (which * does not work for STACK_GROWSUP anyway), and just do it * ahead of time.
*/ if (!mmap_read_lock_maybe_expand(mm, vma, pos, write)) return NULL;
/* * We are doing an exec(). 'current' is the process * doing the exec and 'mm' is the new process's mm.
*/
ret = get_user_pages_remote(mm, pos, 1,
write ? FOLL_WRITE : 0,
&page, NULL);
mmap_read_unlock(mm); if (ret <= 0) return NULL;
/*
 * Report whether a string of @len bytes is acceptable given the current
 * value of bprm->p: true when @len does not exceed bprm->p.
 */
static bool valid_arg_len(struct linux_binprm *bprm, long len)
{
	return len <= bprm->p;
}
#endif/* CONFIG_MMU */
/* * Create a new mm_struct and populate it with a temporary stack * vm_area_struct. We don't have enough context at this point to set the stack * flags, permissions, and offset, so we use temporary values. We'll update * them later in setup_arg_pages().
*/ staticint bprm_mm_init(struct linux_binprm *bprm)
{ int err; struct mm_struct *mm = NULL;
bprm->mm = mm = mm_alloc();
err = -ENOMEM; if (!mm) goto err;
/* Save current stack limit for all calculations made during exec. */
task_lock(current->group_leader);
bprm->rlim_stack = current->signal->rlim[RLIMIT_STACK];
task_unlock(current->group_leader);
#ifdef CONFIG_COMPAT if (unlikely(argv.is_compat)) {
compat_uptr_t compat;
if (get_user(compat, argv.ptr.compat + nr)) return ERR_PTR(-EFAULT);
return compat_ptr(compat);
} #endif
if (get_user(native, argv.ptr.native + nr)) return ERR_PTR(-EFAULT);
return native;
}
/*
 * count() counts the number of strings in array ARGV.
 *
 * @argv: user-space pointer array to walk; an array whose native pointer
 *        is NULL is treated as empty.
 * @max:  largest number of strings accepted.
 *
 * Returns the number of entries preceding the terminating NULL pointer,
 * or a negative errno:
 *   -EFAULT         get_user_arg_ptr() reported an error for an entry,
 *   -E2BIG          a non-NULL entry was found at index @max or beyond,
 *   -ERESTARTNOHAND a fatal signal is pending for the current task.
 */
static int count(struct user_arg_ptr argv, int max)
{
	int i = 0;

	if (argv.ptr.native != NULL) {
		for (;;) {
			const char __user *p = get_user_arg_ptr(argv, i);

			if (!p)
				break;

			if (IS_ERR(p))
				return -EFAULT;

			if (i >= max)
				return -E2BIG;
			++i;

			/* Long arrays: stay killable and give up the CPU. */
			if (fatal_signal_pending(current))
				return -ERESTARTNOHAND;
			cond_resched();
		}
	}
	return i;
}
staticint count_strings_kernel(constchar *const *argv)
{ int i;
if (!argv) return 0;
for (i = 0; argv[i]; ++i) { if (i >= MAX_ARG_STRINGS) return -E2BIG; if (fatal_signal_pending(current)) return -ERESTARTNOHAND;
cond_resched();
} return i;
}
/* * Limit to 1/4 of the max stack size or 3/4 of _STK_LIM * (whichever is smaller) for the argv+env strings. * This ensures that: * - the remaining binfmt code will not run out of stack space, * - the program will have a reasonable amount of stack left * to work from.
*/
limit = _STK_LIM / 4 * 3;
limit = min(limit, bprm->rlim_stack.rlim_cur / 4); /* * We've historically supported up to 32 pages (ARG_MAX) * of argument strings even with small stacks
*/
limit = max_t(unsignedlong, limit, ARG_MAX); /* Reject totally pathological counts. */ if (bprm->argc < 0 || bprm->envc < 0) return -E2BIG; /* * We must account for the size of all the argv and envp pointers to * the argv and envp strings, since they will also take up space in * the stack. They aren't stored until much later when we can't * signal to the parent that the child has run out of stack space. * Instead, calculate it here so it's possible to fail gracefully. * * In the case of argc = 0, make sure there is space for adding a * empty string (which will bump argc to 1), to ensure confused * userspace programs don't start processing from argv[1], thinking * argc can never be 0, to keep them from walking envp by accident. * See do_execveat_common().
*/ if (check_add_overflow(max(bprm->argc, 1), bprm->envc, &ptr_size) ||
check_mul_overflow(ptr_size, sizeof(void *), &ptr_size)) return -E2BIG; if (limit <= ptr_size) return -E2BIG;
limit -= ptr_size;
return bprm_set_stack_limit(bprm, limit);
}
/* * 'copy_strings()' copies argument/environment strings from the old * process's memory to the new process's stack. The call to get_user_pages() * ensures the destination page is created and not swapped out.
*/ staticint copy_strings(int argc, struct user_arg_ptr argv, struct linux_binprm *bprm)
{ struct page *kmapped_page = NULL; char *kaddr = NULL; unsignedlong kpos = 0; int ret;
while (argc-- > 0) { constchar __user *str; int len; unsignedlong pos;
ret = -EFAULT;
str = get_user_arg_ptr(argv, argc); if (IS_ERR(str)) goto out;
len = strnlen_user(str, MAX_ARG_STRLEN); if (!len) goto out;
ret = -E2BIG; if (!valid_arg_len(bprm, len)) goto out;
/* We're going to work our way backwards. */
pos = bprm->p;
str += len;
bprm->p -= len; if (bprm_hit_stack_limit(bprm)) goto out;
while (len > 0) { int offset, bytes_to_copy;
if (fatal_signal_pending(current)) {
ret = -ERESTARTNOHAND; goto out;
}
cond_resched();
page = get_arg_page(bprm, pos, 1); if (!page) {
ret = -E2BIG; goto out;
}
if (kmapped_page) {
flush_dcache_page(kmapped_page);
kunmap_local(kaddr);
put_arg_page(kmapped_page);
}
kmapped_page = page;
kaddr = kmap_local_page(kmapped_page);
kpos = pos & PAGE_MASK;
flush_arg_page(bprm, kpos, kmapped_page);
} if (copy_from_user(kaddr+offset, str, bytes_to_copy)) {
ret = -EFAULT; goto out;
}
}
}
ret = 0;
out: if (kmapped_page) {
flush_dcache_page(kmapped_page);
kunmap_local(kaddr);
put_arg_page(kmapped_page);
} return ret;
}
/* * Copy an argument/environment string from the kernel to the process's stack.
*/ int copy_string_kernel(constchar *arg, struct linux_binprm *bprm)
{ int len = strnlen(arg, MAX_ARG_STRLEN) + 1 /* terminating NUL */; unsignedlong pos = bprm->p;
if (len == 0) return -EFAULT; if (!valid_arg_len(bprm, len)) return -E2BIG;
/* We're going to work our way backwards. */
arg += len;
bprm->p -= len; if (bprm_hit_stack_limit(bprm)) return -E2BIG;
staticint copy_strings_kernel(int argc, constchar *const *argv, struct linux_binprm *bprm)
{ while (argc-- > 0) { int ret = copy_string_kernel(argv[argc], bprm); if (ret < 0) return ret; if (fatal_signal_pending(current)) return -ERESTARTNOHAND;
cond_resched();
} return 0;
}
#ifdef CONFIG_MMU
/* * Finalizes the stack vm_area_struct. The flags and permissions are updated, * the stack is optionally relocated, and some extra space is added.
*/ int setup_arg_pages(struct linux_binprm *bprm, unsignedlong stack_top, int executable_stack)
{ int ret; unsignedlong stack_shift; struct mm_struct *mm = current->mm; struct vm_area_struct *vma = bprm->vma; struct vm_area_struct *prev = NULL;
vm_flags_t vm_flags; unsignedlong stack_base; unsignedlong stack_size; unsignedlong stack_expand; unsignedlong rlim_stack; struct mmu_gather tlb; struct vma_iterator vmi;
if (unlikely(vm_flags & VM_EXEC)) {
pr_warn_once("process '%pD4' started with executable stack\n",
bprm->file);
}
/* Move stack pages down in memory. */ if (stack_shift) { /* * During bprm_mm_init(), we create a temporary stack at STACK_TOP_MAX. Once * the binfmt code determines where the new stack should reside, we shift it to * its final location.
*/
ret = relocate_vma_down(vma, stack_shift); if (ret) goto out_unlock;
}
/* mprotect_fixup is overkill to remove the temporary stack flags */
vm_flags_clear(vma, VM_STACK_INCOMPLETE_SETUP);
stack_expand = 131072UL; /* randomly 32*4k (or 2*64k) pages */
stack_size = vma->vm_end - vma->vm_start; /* * Align this down to a page boundary as expand_stack * will align it up.
*/
rlim_stack = bprm->rlim_stack.rlim_cur & PAGE_MASK;
/* * Transfer the program arguments and environment from the holding pages * onto the stack. The provided stack pointer is adjusted accordingly.
*/ int transfer_args_to_stack(struct linux_binprm *bprm, unsignedlong *sp_location)
{ unsignedlong index, stop, sp; int ret = 0;
stop = bprm->p >> PAGE_SHIFT;
sp = *sp_location;
for (index = MAX_ARG_PAGES - 1; index >= stop; index--) { unsignedint offset = index == stop ? bprm->p & ~PAGE_MASK : 0; char *src = kmap_local_page(bprm->page[index]) + offset;
sp -= PAGE_SIZE - offset; if (copy_to_user((void *) sp, src, PAGE_SIZE - offset) != 0)
ret = -EFAULT;
kunmap_local(src); if (ret) goto out;
}
/* * On success, caller must call do_close_execat() on the returned * struct file to close it.
*/ staticstruct file *do_open_execat(int fd, struct filename *name, int flags)
{ int err; struct file *file __free(fput) = NULL; struct open_flags open_exec_flags = {
.open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC,
.acc_mode = MAY_EXEC,
.intent = LOOKUP_OPEN,
.lookup_flags = LOOKUP_FOLLOW,
};
if ((flags &
~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH | AT_EXECVE_CHECK)) != 0) return ERR_PTR(-EINVAL); if (flags & AT_SYMLINK_NOFOLLOW)
open_exec_flags.lookup_flags &= ~LOOKUP_FOLLOW; if (flags & AT_EMPTY_PATH)
open_exec_flags.lookup_flags |= LOOKUP_EMPTY;
file = do_filp_open(fd, name, &open_exec_flags); if (IS_ERR(file)) return file;
if (path_noexec(&file->f_path)) return ERR_PTR(-EACCES);
/* * In the past the regular type check was here. It moved to may_open() in * 633fb6ac3980 ("exec: move S_ISREG() check earlier"). Since then it is * an invariant that all non-regular files error out before we get here.
*/ if (WARN_ON_ONCE(!S_ISREG(file_inode(file)->i_mode))) return ERR_PTR(-EACCES);
err = exe_file_deny_write_access(file); if (err) return ERR_PTR(err);
return no_free_ptr(file);
}
/** * open_exec - Open a path name for execution * * @name: path name to open with the intent of executing it. * * Returns ERR_PTR on failure or allocated struct file on success. * * As this is a wrapper for the internal do_open_execat(), callers * must call exe_file_allow_write_access() before fput() on release. Also see * do_close_execat().
*/ struct file *open_exec(constchar *name)
{ struct filename *filename = getname_kernel(name); struct file *f = ERR_CAST(filename);
if (!IS_ERR(filename)) {
f = do_open_execat(AT_FDCWD, filename, 0);
putname(filename);
} return f;
}
EXPORT_SYMBOL(open_exec);
/*
 * Maps the mm_struct mm into the current task struct.
 * On success, this function returns with exec_update_lock
 * held for writing.
 *
 * Returns 0 on success. Returns the error from down_write_killable() if
 * exec_update_lock could not be taken, or the error from
 * mmap_read_lock_killable() on the old mm; in the latter case
 * exec_update_lock is released again before returning.
 */
static int exec_mmap(struct mm_struct *mm)
{
	struct task_struct *tsk;
	struct mm_struct *old_mm, *active_mm;
	int ret;

	/* Notify parent that we're no longer interested in the old VM */
	tsk = current;
	old_mm = current->mm;
	exec_mm_release(tsk, old_mm);

	ret = down_write_killable(&tsk->signal->exec_update_lock);
	if (ret)
		return ret;

	if (old_mm) {
		/*
		 * If there is a pending fatal signal, perhaps a signal
		 * whose default action is to create a coredump, get
		 * out and die instead of going through with the exec.
		 */
		ret = mmap_read_lock_killable(old_mm);
		if (ret) {
			up_write(&tsk->signal->exec_update_lock);
			return ret;
		}
	}

	task_lock(tsk);
	membarrier_exec_mmap(mm);

	local_irq_disable();
	active_mm = tsk->active_mm;
	tsk->active_mm = mm;
	tsk->mm = mm;
	mm_init_cid(mm, tsk);
	/*
	 * This prevents preemption while active_mm is being loaded and
	 * it and mm are being updated, which could cause problems for
	 * lazy tlb mm refcounting when these are updated by context
	 * switches. Not all architectures can handle irqs off over
	 * activate_mm yet.
	 */
	if (!IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM))
		local_irq_enable();
	activate_mm(active_mm, mm);
	if (IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM))
		local_irq_enable();
	lru_gen_add_mm(mm);
	task_unlock(tsk);
	lru_gen_use_mm(mm);

	if (old_mm) {
		/* Done with the old address space: unlock it and drop our reference. */
		mmap_read_unlock(old_mm);
		BUG_ON(active_mm != old_mm);
		setmax_mm_hiwater_rss(&tsk->signal->maxrss, old_mm);
		mm_update_next_owner(old_mm);
		mmput(old_mm);
		return 0;
	}

	/* No old mm: only the previously active mm needs to be dropped. */
	mmdrop_lazy_tlb(active_mm);
	return 0;
}
if (thread_group_empty(tsk)) goto no_thread_group;
/* * Kill all other threads in the thread group.
*/
spin_lock_irq(lock); if ((sig->flags & SIGNAL_GROUP_EXIT) || sig->group_exec_task) { /* * Another group action in progress, just * return so that the signal is processed.
*/
spin_unlock_irq(lock); return -EAGAIN;
}
sig->group_exec_task = tsk;
sig->notify_count = zap_other_threads(tsk); if (!thread_group_leader(tsk))
sig->notify_count--;
while (sig->notify_count) {
__set_current_state(TASK_KILLABLE);
spin_unlock_irq(lock);
schedule(); if (__fatal_signal_pending(tsk)) goto killed;
spin_lock_irq(lock);
}
spin_unlock_irq(lock);
/* * At this point all other threads have exited, all we have to * do is to wait for the thread group leader to become inactive, * and to assume its PID:
*/ if (!thread_group_leader(tsk)) { struct task_struct *leader = tsk->group_leader;
for (;;) {
cgroup_threadgroup_change_begin(tsk);
write_lock_irq(&tasklist_lock); /* * Do this under tasklist_lock to ensure that * exit_notify() can't miss ->group_exec_task
*/
sig->notify_count = -1; if (likely(leader->exit_state)) break;
__set_current_state(TASK_KILLABLE);
write_unlock_irq(&tasklist_lock);
cgroup_threadgroup_change_end(tsk);
schedule(); if (__fatal_signal_pending(tsk)) goto killed;
}
/* * The only record we have of the real-time age of a * process, regardless of execs it's done, is start_time. * All the past CPU time is accumulated in signal_struct * from sister threads now dead. But in this non-leader * exec, nothing survives from the original leader thread, * whose birth marks the true age of this process now. * When we take on its identity by switching to its PID, we * also take its birthdate (always earlier than our own).
*/
tsk->start_time = leader->start_time;
tsk->start_boottime = leader->start_boottime;
BUG_ON(!same_thread_group(leader, tsk)); /* * An exec() starts a new thread group with the * TGID of the previous thread group. Rehash the * two threads with a switched PID, and release * the former thread group leader:
*/
/* Become a process group leader with the old leader's pid. * The old leader becomes a thread of this thread group.
*/
exchange_tids(tsk, leader);
transfer_pid(leader, tsk, PIDTYPE_TGID);
transfer_pid(leader, tsk, PIDTYPE_PGID);
transfer_pid(leader, tsk, PIDTYPE_SID);
BUG_ON(leader->exit_state != EXIT_ZOMBIE);
leader->exit_state = EXIT_DEAD; /* * We are going to release_task()->ptrace_unlink() silently, * the tracer can sleep in do_wait(). EXIT_DEAD guarantees * the tracer won't block again waiting for this thread.
*/ if (unlikely(leader->ptrace))
__wake_up_parent(leader, leader->parent);
write_unlock_irq(&tasklist_lock);
cgroup_threadgroup_change_end(tsk);
no_thread_group: /* we have changed execution domain */
tsk->exit_signal = SIGCHLD;
BUG_ON(!thread_group_leader(tsk)); return 0;
killed: /* protects against exit_notify() and __exit_signal() */
read_lock(&tasklist_lock);
sig->group_exec_task = NULL;
sig->notify_count = 0;
read_unlock(&tasklist_lock); return -EAGAIN;
}
/* * This function makes sure the current process has its own signal table, * so that flush_signal_handlers can later reset the handlers without * disturbing other processes. (Other processes might share the signal * table via the CLONE_SIGHAND option to clone().)
*/ staticint unshare_sighand(struct task_struct *me)
{ struct sighand_struct *oldsighand = me->sighand;
if (refcount_read(&oldsighand->count) != 1) { struct sighand_struct *newsighand; /* * This ->sighand is shared with the CLONE_SIGHAND * but not CLONE_THREAD task, switch to the new one.
*/
newsighand = kmem_cache_alloc(sighand_cachep, GFP_KERNEL); if (!newsighand) return -ENOMEM;
/* * This is unlocked -- the string will always be NUL-terminated, but * may show overlapping contents if racing concurrent reads.
*/ void __set_task_comm(struct task_struct *tsk, constchar *buf, bool exec)
{
size_t len = min(strlen(buf), sizeof(tsk->comm) - 1);
/* * Calling this is the point of no return. None of the failures will be * seen by userspace since either the process is already taking a fatal * signal (via de_thread() or coredump), or will have SEGV raised * (after exec_mmap()) by search_binary_handler (see below).
*/ int begin_new_exec(struct linux_binprm * bprm)
{ struct task_struct *me = current; int retval;
/* Once we are committed compute the creds */
retval = bprm_creds_from_file(bprm); if (retval) return retval;
/* * This tracepoint marks the point before flushing the old exec where * the current task is still unchanged, but errors are fatal (point of * no return). The later "sched_process_exec" tracepoint is called after * the current task has successfully switched to the new exec.
*/
trace_sched_prepare_exec(current, bprm);
/* * Ensure all future errors are fatal.
*/
bprm->point_of_no_return = true;
/* Make this the only thread in the thread group */
retval = de_thread(me); if (retval) goto out; /* see the comment in check_unsafe_exec() */
current->fs->in_exec = 0; /* * Cancel any io_uring activity across execve
*/
io_uring_task_cancel();
/* Ensure the files table is not shared. */
retval = unshare_files(); if (retval) goto out;
/* * Must be called _before_ exec_mmap() as bprm->mm is * not visible until then. Doing it here also ensures * we don't race against replace_mm_exe_file().
*/
retval = set_mm_exe_file(bprm->mm, bprm->file); if (retval) goto out;
/* If the binary is not readable then enforce mm->dumpable=0 */
would_dump(bprm, bprm->file); if (bprm->have_execfd)
would_dump(bprm, bprm->executable);
/* * Release all of the old mmap stuff
*/
acct_arg_size(bprm, 0);
retval = exec_mmap(bprm->mm); if (retval) goto out;
bprm->mm = NULL;
retval = exec_task_namespaces(); if (retval) goto out_unlock;
/* * We have to apply CLOEXEC before we change whether the process is * dumpable (in setup_new_exec) to avoid a race with a process in userspace * trying to access the should-be-closed file descriptors of a process * undergoing exec(2).
*/
do_close_on_exec(me->files);
if (bprm->secureexec) { /* Make sure parent cannot signal privileged process. */
me->pdeath_signal = 0;
/* * For secureexec, reset the stack limit to sane default to * avoid bad behavior from the prior rlimits. This has to * happen before arch_pick_mmap_layout(), which examines * RLIMIT_STACK, but after the point of no return to avoid * needing to clean up the change on failure.
*/ if (bprm->rlim_stack.rlim_cur > _STK_LIM)
bprm->rlim_stack.rlim_cur = _STK_LIM;
}
me->sas_ss_sp = me->sas_ss_size = 0;
/* * Figure out dumpability. Note that this checking only of current * is wrong, but userspace depends on it. This should be testing * bprm->secureexec instead.
*/ if (bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP ||
!(uid_eq(current_euid(), current_uid()) &&
gid_eq(current_egid(), current_gid())))
set_dumpable(current->mm, suid_dumpable); else
set_dumpable(current->mm, SUID_DUMP_USER);
perf_event_exec();
/* * If the original filename was empty, alloc_bprm() made up a path * that will probably not be useful to admins running ps or similar. * Let's fix it up to be something reasonable.
*/ if (bprm->comm_from_dentry) { /* * Hold RCU lock to keep the name from being freed behind our back. * Use acquire semantics to make sure the terminating NUL from * __d_alloc() is seen. * * Note, we're deliberately sloppy here. We don't need to care about * detecting a concurrent rename and just want a terminated name.
*/
rcu_read_lock();
__set_task_comm(me, smp_load_acquire(&bprm->file->f_path.dentry->d_name.name), true);
rcu_read_unlock();
} else {
__set_task_comm(me, kbasename(bprm->filename), true);
}
/* An exec changes our domain. We are no longer part of the thread
group */
WRITE_ONCE(me->self_exec_id, me->self_exec_id + 1);
flush_signal_handlers(me, 0);
retval = set_cred_ucounts(bprm->cred); if (retval < 0) goto out_unlock;
/* * install the new credentials for this executable
*/
security_bprm_committing_creds(bprm);
commit_creds(bprm->cred);
bprm->cred = NULL;
/* * Disable monitoring for regular users * when executing setuid binaries. Must * wait until new credentials are committed * by commit_creds() above
*/ if (get_dumpable(me->mm) != SUID_DUMP_USER)
perf_event_exit_task(me); /* * cred_guard_mutex must be held at least to this point to prevent * ptrace_attach() from altering our determination of the task's * credentials; any time after this it may be unlocked.
*/
security_bprm_committed_creds(bprm);
/* Pass the opened binary to the interpreter. */ if (bprm->have_execfd) {
retval = get_unused_fd_flags(0); if (retval < 0) goto out_unlock;
fd_install(retval, bprm->executable);
bprm->executable = NULL;
bprm->execfd = retval;
} return 0;
out_unlock:
up_write(&me->signal->exec_update_lock); if (!bprm->cred)
mutex_unlock(&me->signal->cred_guard_mutex);
void setup_new_exec(struct linux_binprm * bprm)
{ /* Setup things that can depend upon the personality */ struct task_struct *me = current;
arch_pick_mmap_layout(me->mm, &bprm->rlim_stack);
arch_setup_new_exec();
/* Set the new mm task size. We have to do that late because it may * depend on TIF_32BIT which is only updated in flush_thread() on * some architectures like powerpc
*/
me->mm->task_size = TASK_SIZE;
up_write(&me->signal->exec_update_lock);
mutex_unlock(&me->signal->cred_guard_mutex);
}
EXPORT_SYMBOL(setup_new_exec);
/*
 * Runs immediately before start_thread() takes over.
 *
 * Writes the stack rlimit carried in @bprm back into the signal struct's
 * RLIMIT_STACK slot, under the group leader's task lock.
 */
void finalize_exec(struct linux_binprm *bprm)
{
	struct task_struct *leader = current->group_leader;

	/* Store any stack rlimit changes before starting thread. */
	task_lock(leader);
	current->signal->rlim[RLIMIT_STACK] = bprm->rlim_stack;
	task_unlock(leader);
}
EXPORT_SYMBOL(finalize_exec);
/* * Prepare credentials and lock ->cred_guard_mutex. * setup_new_exec() commits the new creds and drops the lock. * Or, if exec fails before, free_bprm() should release ->cred * and unlock.
*/ staticint prepare_bprm_creds(struct linux_binprm *bprm)
{ if (mutex_lock_interruptible(¤t->signal->cred_guard_mutex)) return -ERESTARTNOINTR;
bprm->cred = prepare_exec_creds(); if (likely(bprm->cred)) return 0;
/*
 * Tear down @bprm and everything it still owns.
 *
 * Releases, in order: the not-yet-installed mm (after resetting the
 * argument-page accounting), the argument pages, the prepared credentials
 * (clearing fs->in_exec and unlocking cred_guard_mutex first), the opened
 * file, the extra executable reference, a replaced interpreter name, the
 * fd path, and finally @bprm itself.
 */
static void free_bprm(struct linux_binprm *bprm)
{
	if (bprm->mm) {
		acct_arg_size(bprm, 0);
		mmput(bprm->mm);
	}
	free_arg_pages(bprm);

	if (bprm->cred) {
		/* in case exec fails before de_thread() succeeds */
		current->fs->in_exec = 0;
		mutex_unlock(&current->signal->cred_guard_mutex);
		abort_creds(bprm->cred);
	}

	do_close_execat(bprm->file);
	if (bprm->executable)
		fput(bprm->executable);

	/* If a binfmt changed the interp, free it. */
	if (bprm->interp != bprm->filename)
		kfree(bprm->interp);
	kfree(bprm->fdpath);
	kfree(bprm);
}
staticstruct linux_binprm *alloc_bprm(int fd, struct filename *filename, int flags)
{ struct linux_binprm *bprm; struct file *file; int retval = -ENOMEM;
file = do_open_execat(fd, filename, flags); if (IS_ERR(file)) return ERR_CAST(file);
/* * Record that a name derived from an O_CLOEXEC fd will be * inaccessible after exec. This allows the code in exec to * choose to fail when the executable is not mmaped into the * interpreter and an open file descriptor is not passed to * the interpreter. This makes for a better user experience * than having the interpreter start and then immediately fail * when it finds the executable is inaccessible.
*/ if (get_close_on_exec(fd))
bprm->interp_flags |= BINPRM_FLAGS_PATH_INACCESSIBLE;
/* * At this point, security_file_open() has already been called (with * __FMODE_EXEC) and access control checks for AT_EXECVE_CHECK will * stop just after the security_bprm_creds_for_exec() call in * bprm_execve(). Indeed, the kernel should not try to parse the * content of the file with exec_binprm() nor change the calling * thread, which means that the following security functions will not * be called: * - security_bprm_check() * - security_bprm_creds_from_file() * - security_bprm_committing_creds() * - security_bprm_committed_creds()
*/
bprm->is_check = !!(flags & AT_EXECVE_CHECK);
retval = bprm_mm_init(bprm); if (!retval) return bprm;
int bprm_change_interp(constchar *interp, struct linux_binprm *bprm)
{ /* If a binfmt changed the interp, free it first. */ if (bprm->interp != bprm->filename)
kfree(bprm->interp);
bprm->interp = kstrdup(interp, GFP_KERNEL); if (!bprm->interp) return -ENOMEM; return 0;
}
EXPORT_SYMBOL(bprm_change_interp);
/*
 * determine how safe it is to execute the proposed program
 * - the caller must hold ->cred_guard_mutex to protect against
 *   PTRACE_ATTACH or seccomp thread-sync
 *
 * Accumulates LSM_UNSAFE_* bits in bprm->unsafe:
 *   LSM_UNSAFE_PTRACE        the current task has ->ptrace set,
 *   LSM_UNSAFE_NO_NEW_PRIVS  task_no_new_privs() is true for it,
 *   LSM_UNSAFE_SHARE         its fs struct has more users than the threads
 *                            of this group that reference it.
 * When the fs struct is not shared beyond the thread group,
 * fs->in_exec is set to 1 instead.
 */
static void check_unsafe_exec(struct linux_binprm *bprm)
{
	struct task_struct *p = current, *t;
	unsigned n_fs;

	if (p->ptrace)
		bprm->unsafe |= LSM_UNSAFE_PTRACE;

	/*
	 * This isn't strictly necessary, but it makes it harder for LSMs to
	 * mess up.
	 */
	if (task_no_new_privs(current))
		bprm->unsafe |= LSM_UNSAFE_NO_NEW_PRIVS;

	/*
	 * If another task is sharing our fs, we cannot safely
	 * suid exec because the differently privileged task
	 * will be able to manipulate the current directory, etc.
	 * It would be nice to force an unshare instead...
	 *
	 * Otherwise we set fs->in_exec = 1 to deny clone(CLONE_FS)
	 * from another sub-thread until de_thread() succeeds, this
	 * state is protected by cred_guard_mutex we hold.
	 */
	n_fs = 1;	/* counts ourselves */
	read_seqlock_excl(&p->fs->seq);
	rcu_read_lock();
	for_other_threads(p, t) {
		if (t->fs == p->fs)
			n_fs++;
	}
	rcu_read_unlock();

	/* "users" and "in_exec" locked for copy_fs() */
	if (p->fs->users > n_fs)
		bprm->unsafe |= LSM_UNSAFE_SHARE;
	else
		p->fs->in_exec = 1;
	read_sequnlock_excl(&p->fs->seq);
}
mode = READ_ONCE(inode->i_mode); if (!(mode & (S_ISUID|S_ISGID))) return;
idmap = file_mnt_idmap(file);
/* Be careful if suid/sgid is set */
inode_lock(inode);
/* Atomically reload and check mode/uid/gid now that lock held. */
mode = inode->i_mode;
vfsuid = i_uid_into_vfsuid(idmap, inode);
vfsgid = i_gid_into_vfsgid(idmap, inode);
err = inode_permission(idmap, inode, MAY_EXEC);
inode_unlock(inode);
/* Did the exec bit vanish out from under us? Give up. */ if (err) return;
/* We ignore suid/sgid if there are no mappings for them in the ns */ if (!vfsuid_has_mapping(bprm->cred->user_ns, vfsuid) ||
!vfsgid_has_mapping(bprm->cred->user_ns, vfsgid)) return;
/* * Compute bprm->cred based upon the final binary.
*/ staticint bprm_creds_from_file(struct linux_binprm *bprm)
{ /* Compute creds based on which file? */ struct file *file = bprm->execfd_creds ? bprm->executable : bprm->file;
/* * Fill the binprm structure from the inode. * Read the first BINPRM_BUF_SIZE bytes * * This may be called multiple times for binary chains (scripts for example).
*/ staticint prepare_binprm(struct linux_binprm *bprm)
{
loff_t pos = 0;
/*
 * Arguments are '\0' separated strings found at the location bprm->p
 * points to; chop off the first by relocating bprm->p to right after
 * the first '\0' encountered.
 *
 * Returns 0 on success (also when bprm->argc is already 0, in which case
 * nothing is changed) and -EFAULT when an argument page cannot be
 * obtained. On success bprm->argc is decremented by one.
 */
int remove_arg_zero(struct linux_binprm *bprm)
{
	unsigned long offset;
	char *kaddr;
	struct page *page;

	if (!bprm->argc)
		return 0;

	do {
		offset = bprm->p & ~PAGE_MASK;
		page = get_arg_page(bprm, bprm->p, 0);
		if (!page)
			return -EFAULT;
		kaddr = kmap_local_page(page);

		/* Advance to the NUL terminator or to the end of this page. */
		for (; offset < PAGE_SIZE && kaddr[offset];
				offset++, bprm->p++)
			;

		kunmap_local(kaddr);
		put_arg_page(page);
	} while (offset == PAGE_SIZE);	/* string continues on the next page */

	/* Step over the terminating NUL itself. */
	bprm->p++;
	bprm->argc--;

	return 0;
}
EXPORT_SYMBOL(remove_arg_zero);
/* * cycle the list of binary formats handler, until one recognizes the image
*/ staticint search_binary_handler(struct linux_binprm *bprm)
{ struct linux_binfmt *fmt; int retval;
retval = prepare_binprm(bprm); if (retval < 0) return retval;
retval = security_bprm_check(bprm); if (retval) return retval;
read_lock(&binfmt_lock);
list_for_each_entry(fmt, &formats, lh) { if (!try_module_get(fmt->module)) continue;
read_unlock(&binfmt_lock);
/* binfmt handlers will call back into begin_new_exec() on success. */ staticint exec_binprm(struct linux_binprm *bprm)
{
pid_t old_pid, old_vpid; int ret, depth;
/* Need to fetch pid before load_binary changes it */
old_pid = current->pid;
rcu_read_lock();
old_vpid = task_pid_nr_ns(current, task_active_pid_ns(current->parent));
rcu_read_unlock();
/* This allows 4 levels of binfmt rewrites before failing hard. */ for (depth = 0;; depth++) { struct file *exec; if (depth > 5) return -ELOOP;
ret = search_binary_handler(bprm); if (ret < 0) return ret; if (!bprm->interpreter) break;
staticint bprm_execve(struct linux_binprm *bprm)
{ int retval;
retval = prepare_bprm_creds(bprm); if (retval) return retval;
/* * Check for unsafe execution states before exec_binprm(), which * will call back into begin_new_exec(), into bprm_creds_from_file(), * where setuid-ness is evaluated.
*/
check_unsafe_exec(bprm);
current->in_execve = 1;
sched_mm_cid_before_execve(current);
sched_exec();
/* Set the unchanging part of bprm->cred */
retval = security_bprm_creds_for_exec(bprm); if (retval || bprm->is_check) goto out;
retval = exec_binprm(bprm); if (retval < 0) goto out;
out: /* * If past the point of no return ensure the code never * returns to the userspace process. Use an existing fatal * signal if present otherwise terminate the process with * SIGSEGV.
*/ if (bprm->point_of_no_return && !fatal_signal_pending(current))
force_fatal_sig(SIGSEGV);
staticint do_execveat_common(int fd, struct filename *filename, struct user_arg_ptr argv, struct user_arg_ptr envp, int flags)
{ struct linux_binprm *bprm; int retval;
if (IS_ERR(filename)) return PTR_ERR(filename);
/* * We move the actual failure in case of RLIMIT_NPROC excess from * set*uid() to execve() because too many poorly written programs * don't check setuid() return code. Here we additionally recheck * whether NPROC limit is still exceeded.
*/ if ((current->flags & PF_NPROC_EXCEEDED) &&
is_rlimit_overlimit(current_ucounts(), UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC))) {
retval = -EAGAIN; goto out_ret;
}
/* We're below the limit (still or again), so we don't want to make
* further execve() calls fail. */
current->flags &= ~PF_NPROC_EXCEEDED;
/* * When argv is empty, add an empty string ("") as argv[0] to * ensure confused userspace programs that start processing * from argv[1] won't end up walking envp. See also * bprm_stack_limits().
*/ if (bprm->argc == 0) {
retval = copy_string_kernel("", bprm); if (retval < 0) goto out_free;
bprm->argc = 1;
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit noch Richtigkeit
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.