/*
 *  'fork.c' contains the help-routines for the 'fork' system call
 *  (see also entry.S and others).
 *  Fork is rather simple, once you get the hang of it, but the memory
 *  management can be a bitch. See 'mm/memory.c': 'copy_page_range()'
 */
/*
 * Minimum number of threads to boot the kernel
 */
#define MIN_THREADS 20

/*
 * Maximum number of threads
 */
#define MAX_THREADS FUTEX_TID_MASK

/*
 * Protected counters by write_lock_irq(&tasklist_lock)
 */
unsigned long total_forks;	/* Handle normal Linux uptimes. */
int nr_threads;			/* The idle threads do not count.. */

static int max_threads;		/* tunable limit on nr_threads */
#ifdef CONFIG_VMAP_STACK
/*
 * vmalloc() is a bit slow, and calling vfree() enough times will force a TLB
 * flush.  Try to minimize the number of calls by caching stacks.
 */
#define NR_CACHED_STACKS 2
static DEFINE_PER_CPU(struct vm_struct *, cached_stacks[NR_CACHED_STACKS]);

/*
 * Allocated stacks are cached and later reused by new threads, so memcg
 * accounting is performed by the code assigning/releasing stacks to tasks.
 * We need a zeroed memory without __GFP_ACCOUNT.
 */
#define GFP_VMAP_STACK (GFP_KERNEL | __GFP_ZERO)
for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) {
ret = memcg_kmem_charge_page(vm_area->pages[i], GFP_KERNEL, 0); if (ret) goto err;
nr_charged++;
} return 0;
err: for (i = 0; i < nr_charged; i++)
memcg_kmem_uncharge_page(vm_area->pages[i], 0); return ret;
}
/*
 * alloc_thread_stack_node - set up a vmalloc-backed kernel stack for @tsk.
 *
 * NOTE(review): this body appears damaged by extraction: 'stack' is used
 * (find_vm_area(stack), vfree(stack), kasan_reset_tag(stack)) before it is
 * ever assigned, the 'staticint' token is fused, and the function's closing
 * brace is missing — the upstream version assigns 'stack' from the cached
 * vm_area (or a fresh __vmalloc_node_range()) before these uses.  Restore
 * from the original source rather than compiling as-is.
 */
staticint alloc_thread_stack_node(struct task_struct *tsk, int node)
{ struct vm_struct *vm_area; void *stack; int i;
/* First try to reuse a stack parked in this CPU's cache. */
for (i = 0; i < NR_CACHED_STACKS; i++) {
vm_area = this_cpu_xchg(cached_stacks[i], NULL); if (!vm_area) continue;
vm_area = find_vm_area(stack); if (memcg_charge_kernel_stack(vm_area)) {
vfree(stack); return -ENOMEM;
} /* * We can't call find_vm_area() in interrupt context, and * free_thread_stack() can be called in interrupt context, * so cache the vm_struct.
*/
tsk->stack_vm_area = vm_area;
stack = kasan_reset_tag(stack);
tsk->stack = stack; return 0;
}
/*
 * Release @tsk's kernel stack: try to park it in the per-CPU stack cache
 * first; if that fails, free it via the delayed (vfree-safe) path.
 */
static void free_thread_stack(struct task_struct *tsk)
{
	if (!try_release_thread_stack_to_cache(tsk->stack_vm_area))
		thread_stack_delayed_free(tsk);

	/* Clear stale pointers to guard against use-after-free. */
	tsk->stack = NULL;
	tsk->stack_vm_area = NULL;
}
#else/* !CONFIG_VMAP_STACK */
/* * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a * kmemcache based allocator.
*/ #if THREAD_SIZE >= PAGE_SIZE
if (IS_ENABLED(CONFIG_VMAP_STACK)) { struct vm_struct *vm_area; int i;
vm_area = task_stack_vm_area(tsk); for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++)
memcg_kmem_uncharge_page(vm_area->pages[i], 0);
}
}
/*
 * NOTE(review): this looks like two upstream functions merged by extraction.
 * The signature says release_task_stack(), but the body recursively calls
 * release_task_stack() and ends with free_task_struct() and
 * EXPORT_SYMBOL(free_task) — i.e. it is the tail of free_task() fused onto
 * the release_task_stack() header.  The 'staticvoid' token is also fused.
 * Restore both functions from the original source.
 */
staticvoid release_task_stack(struct task_struct *tsk)
{ if (WARN_ON(READ_ONCE(tsk->__state) != TASK_DEAD)) return; /* Better to leak the stack than to free prematurely */
#ifndef CONFIG_THREAD_INFO_IN_TASK /* * The task is finally done with both the stack and thread_info, * so free both.
*/
release_task_stack(tsk); #else /* * If the task had a separate stack allocation, it should be gone * by now.
*/
WARN_ON_ONCE(refcount_read(&tsk->stack_refcount) != 0); #endif
rt_mutex_debug_task_free(tsk);
ftrace_graph_exit_task(tsk);
arch_release_task_struct(tsk); if (tsk->flags & PF_KTHREAD)
free_kthread_struct(tsk);
bpf_task_storage_free(tsk);
free_task_struct(tsk);
}
EXPORT_SYMBOL(free_task);
exe_file = get_mm_exe_file(oldmm);
RCU_INIT_POINTER(mm->exe_file, exe_file); /* * We depend on the oldmm having properly denied write access to the * exe_file already.
*/ if (exe_file && exe_file_deny_write_access(exe_file))
pr_warn_once("exe_file_deny_write_access() failed in %s\n", __func__);
}
/*
 * Allocate a unique MM ID for @mm from the global mm_ida range.
 * Returns 0 on success or a negative errno from ida_alloc_range().
 */
static inline int mm_alloc_id(struct mm_struct *mm)
{
	int ret;

	ret = ida_alloc_range(&mm_ida, MM_ID_MIN, MM_ID_MAX, GFP_KERNEL);
	if (ret < 0)
		return ret;
	mm->mm_id = ret;
	return 0;
}
staticinlinevoid mm_free_id(struct mm_struct *mm)
{ const mm_id_t id = mm->mm_id;
mm->mm_id = MM_ID_DUMMY; if (id == MM_ID_DUMMY) return; if (WARN_ON_ONCE(id < MM_ID_MIN || id > MM_ID_MAX)) return;
ida_free(&mm_ida, id);
} #else/* !CONFIG_MM_ID */ staticinlineint mm_alloc_id(struct mm_struct *mm) { return 0; } staticinlinevoid mm_free_id(struct mm_struct *mm) {} #endif/* CONFIG_MM_ID */
/*
 * Sanity-check that all of @mm's resource counters have reached zero
 * before the mm is freed; report any leak as a kernel bug.
 */
static void check_mm(struct mm_struct *mm)
{
	int i;

	BUILD_BUG_ON_MSG(ARRAY_SIZE(resident_page_types) != NR_MM_COUNTERS,
			 "Please make sure 'struct resident_page_types[]' is updated as well");

	for (i = 0; i < NR_MM_COUNTERS; i++) {
		long x = percpu_counter_sum(&mm->rss_stat[i]);

		/* A non-zero RSS counter at teardown means leaked pages. */
		if (unlikely(x)) {
			pr_alert("BUG: Bad rss-counter state mm:%p type:%s val:%ld Comm:%s Pid:%d\n",
				 mm, resident_page_types[i], x,
				 current->comm,
				 task_pid_nr(current));
		}
	}

	/* Page-table memory must also be fully released by now. */
	if (mm_pgtables_bytes(mm))
		pr_alert("BUG: non-zero pgtables_bytes on freeing mm: %ld\n",
			 mm_pgtables_bytes(mm));
}
/*
 * Make sure no CPU is still using @mm as a lazy TLB mm before it is freed.
 */
static void cleanup_lazy_tlbs(struct mm_struct *mm)
{
	if (!IS_ENABLED(CONFIG_MMU_LAZY_TLB_SHOOTDOWN)) {
		/*
		 * In this case, lazy tlb mms are refounted and would not reach
		 * __mmdrop until all CPUs have switched away and mmdrop()ed.
		 */
		return;
	}

	/*
	 * Lazy mm shootdown does not refcount "lazy tlb mm" usage, rather it
	 * requires lazy mm users to switch to another mm when the refcount
	 * drops to zero, before the mm is freed. This requires IPIs here to
	 * switch kernel threads to init_mm.
	 *
	 * archs that use IPIs to flush TLBs can piggy-back that lazy tlb mm
	 * switch with the final userspace teardown TLB flush which leaves the
	 * mm lazy on this CPU but no others, reducing the need for additional
	 * IPIs here. There are cases where a final IPI is still required here,
	 * such as the final mmdrop being performed on a different CPU than the
	 * one exiting, or kernel threads using the mm when userspace exits.
	 *
	 * IPI overheads have not found to be expensive, but they could be
	 * reduced in a number of possible ways, for example (roughly
	 * increasing order of complexity):
	 * - The last lazy reference created by exit_mm() could instead switch
	 *   to init_mm, however it's probable this will run on the same CPU
	 *   immediately afterwards, so this may not reduce IPIs much.
	 * - A batch of mms requiring IPIs could be gathered and freed at once.
	 * - CPUs store active_mm where it can be remotely checked without a
	 *   lock, to filter out false-positives in the cpumask.
	 * - After mm_users or mm_count reaches zero, switching away from the
	 *   mm could clear mm_cpumask to reduce some IPIs, perhaps together
	 *   with some batching or delaying of the final IPIs.
	 * - A delayed freeing and RCU-like quiescing sequence based on mm
	 *   switching to avoid IPIs completely.
	 */
	on_each_cpu_mask(mm_cpumask(mm), do_shoot_lazy_tlb, (void *)mm, 1);
	if (IS_ENABLED(CONFIG_DEBUG_VM_SHOOT_LAZIES))
		on_each_cpu(do_check_lazy_tlb, (void *)mm, 1);
}
/* * Called when the last reference to the mm * is dropped: either by a lazy thread or by * mmput. Free the page directory and the mm.
*/ void __mmdrop(struct mm_struct *mm)
{
/* Dropping init_mm or the caller's currently-active mm would be fatal. */
BUG_ON(mm == &init_mm);
WARN_ON_ONCE(mm == current->mm);
/* Ensure no CPUs are using this as their lazy tlb mm */
cleanup_lazy_tlbs(mm);
/*
 * NOTE(review): the body is truncated here — the rest of __mmdrop()
 * (pgd/context teardown, check_mm(), free_mm(), ...) appears to have
 * been lost in extraction; restore from the original source.
 */
/*
 * Final teardown of a signal_struct once its refcount has dropped to zero.
 */
static inline void free_signal_struct(struct signal_struct *sig)
{
	taskstats_tgid_free(sig);
	sched_autogroup_exit(sig);
	/*
	 * __mmdrop is not safe to call from softirq context on x86 due to
	 * pgd_dtor so postpone it to the async context
	 */
	if (sig->oom_mm)
		mmdrop_async(sig->oom_mm);
	kmem_cache_free(signal_cachep, sig);
}
/*
 * Drop one reference to @sig and free it when the last reference goes away.
 */
static inline void put_signal_struct(struct signal_struct *sig)
{
	if (refcount_dec_and_test(&sig->sigcnt))
		free_signal_struct(sig);
}
/* * The number of threads shall be limited such that the thread * structures may only consume a small part of the available memory.
*/ if (fls64(nr_pages) + fls64(PAGE_SIZE) > 64)
threads = MAX_THREADS; else
threads = div64_u64((u64) nr_pages * (u64) PAGE_SIZE,
(u64) THREAD_SIZE * 8UL);
if (threads > max_threads_suggested)
threads = max_threads_suggested;
/* create a slab on which task_structs can be allocated */
task_struct_whitelist(&useroffset, &usersize);
task_struct_cachep = kmem_cache_create_usercopy("task_struct",
arch_task_struct_size, align,
SLAB_PANIC|SLAB_ACCOUNT,
useroffset, usersize, NULL);
/* do the arch specific task caches init */
arch_task_cache_init();
err = scs_prepare(tsk, node); if (err) goto free_stack;
#ifdef CONFIG_SECCOMP /* * We must handle setting up seccomp filters once we're under * the sighand lock in case orig has changed between now and * then. Until then, filter must be NULL to avoid messing up * the usage counts on the error path calling free_task.
*/
tsk->seccomp.filter = NULL; #endif
/* * One for the user space visible state that goes away when reaped. * One for the scheduler.
*/
refcount_set(&tsk->rcu_users, 2); /* One for the rcu users */
refcount_set(&tsk->usage, 1); #ifdef CONFIG_BLK_DEV_IO_TRACE
tsk->btrace_seq = 0; #endif
tsk->splice_pipe = NULL;
tsk->task_frag.page = NULL;
tsk->wake_q.next = NULL;
tsk->worker_private = NULL;
/**
 * set_mm_exe_file - change a reference to the mm's executable file
 * @mm: The mm to change.
 * @new_exe_file: The new file to use.
 *
 * This changes mm's executable file (shown as symlink /proc/[pid]/exe).
 *
 * Main users are mmput() and sys_execve(). Callers prevent concurrent
 * invocations: in mmput() nobody alive left, in execve it happens before
 * the new mm is made visible to anyone.
 *
 * Can only fail if new_exe_file != NULL.
 */
int set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
{
	struct file *old_exe_file;

	/*
	 * It is safe to dereference the exe_file without RCU as
	 * this function is only called if nobody else can access
	 * this mm -- see comment above for justification.
	 */
	old_exe_file = rcu_dereference_raw(mm->exe_file);

	if (new_exe_file) {
		/*
		 * We expect the caller (i.e., sys_execve) to have already
		 * denied write access, so this is unlikely to fail.
		 */
		if (unlikely(exe_file_deny_write_access(new_exe_file)))
			return -EACCES;
		get_file(new_exe_file);
	}

	rcu_assign_pointer(mm->exe_file, new_exe_file);

	if (old_exe_file) {
		exe_file_allow_write_access(old_exe_file);
		fput(old_exe_file);
	}
	return 0;
}
/**
 * replace_mm_exe_file - replace a reference to the mm's executable file
 * @mm: The mm to change.
 * @new_exe_file: The new file to use.
 *
 * This changes mm's executable file (shown as symlink /proc/[pid]/exe).
 *
 * Main user is sys_prctl(PR_SET_MM_MAP/EXE_FILE).
 */
int replace_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
{
	struct vm_area_struct *vma;
	struct file *old_exe_file;
	int ret = 0;

	/* Forbid mm->exe_file change if old file still mapped. */
	old_exe_file = get_mm_exe_file(mm);
	if (old_exe_file) {
		VMA_ITERATOR(vmi, mm, 0);

		mmap_read_lock(mm);
		for_each_vma(vmi, vma) {
			if (!vma->vm_file)
				continue;
			if (path_equal(&vma->vm_file->f_path,
				       &old_exe_file->f_path)) {
				ret = -EBUSY;
				break;
			}
		}
		mmap_read_unlock(mm);
		fput(old_exe_file);
		if (ret)
			return ret;
	}

	ret = exe_file_deny_write_access(new_exe_file);
	if (ret)
		return -EACCES;
	get_file(new_exe_file);

	/* set the new file */
	mmap_write_lock(mm);
	old_exe_file = rcu_dereference_raw(mm->exe_file);
	rcu_assign_pointer(mm->exe_file, new_exe_file);
	mmap_write_unlock(mm);

	if (old_exe_file) {
		exe_file_allow_write_access(old_exe_file);
		fput(old_exe_file);
	}
	return 0;
}
/** * get_mm_exe_file - acquire a reference to the mm's executable file * @mm: The mm of interest. * * Returns %NULL if mm has no associated executable file. * User must release file via fput().
*/ struct file *get_mm_exe_file(struct mm_struct *mm)
{ struct file *exe_file;
/**
 * get_task_exe_file - acquire a reference to the task's executable file
 * @task: The task.
 *
 * Returns %NULL if task's mm (if any) has no associated executable file or
 * this is a kernel thread with borrowed mm (see the comment above get_task_mm).
 * User must release file via fput().
 */
struct file *get_task_exe_file(struct task_struct *task)
{
	struct file *exe_file = NULL;

	if (task->flags & PF_KTHREAD)
		return NULL;

	task_lock(task);
	if (task->mm)
		exe_file = get_mm_exe_file(task->mm);
	task_unlock(task);
	return exe_file;
}
/**
 * get_task_mm - acquire a reference to the task's mm
 * @task: The task.
 *
 * Returns %NULL if the task has no mm. Checks PF_KTHREAD (meaning
 * this kernel workthread has transiently adopted a user mm with use_mm,
 * to do its AIO) is not set and if so returns a reference to it, after
 * bumping up the use count. User must release the mm via mmput()
 * after use. Typically used by /proc and ptrace.
 */
struct mm_struct *get_task_mm(struct task_struct *task)
{
	struct mm_struct *mm = NULL;

	if (task->flags & PF_KTHREAD)
		return NULL;

	task_lock(task);
	if (task->mm) {
		mm = task->mm;
		mmget(mm);
	}
	task_unlock(task);
	return mm;
}
EXPORT_SYMBOL_GPL(get_task_mm);
staticbool may_access_mm(struct mm_struct *mm, struct task_struct *task, unsignedint mode)
{ if (mm == current->mm) returntrue; if (ptrace_may_access(task, mode)) returntrue; if ((mode & PTRACE_MODE_READ) && perfmon_capable()) returntrue; returnfalse;
}
if (killed) {
task_lock(child);
child->vfork_done = NULL;
task_unlock(child);
}
put_task_struct(child); return killed;
}
/*
 * Please note the differences between mmput and mm_release.
 * mmput is called whenever we stop holding onto a mm_struct,
 * error success whatever.
 *
 * mm_release is called after a mm_struct has been removed
 * from the current process.
 *
 * This difference is important for error handling, when we
 * only half set up a mm_struct for a new process and need to restore
 * the old one.  Because we mmput the new mm_struct before
 * restoring the old one. . .
 * Eric Biederman 10 January 1998
 */
static void mm_release(struct task_struct *tsk, struct mm_struct *mm)
{
	uprobe_free_utask(tsk);

	/* Get rid of any cached register state */
	deactivate_mm(tsk, mm);

	/*
	 * Signal userspace if we're not exiting with a core dump
	 * because we want to leave the value intact for debugging
	 * purposes.
	 */
	if (tsk->clear_child_tid) {
		if (atomic_read(&mm->mm_users) > 1) {
			/*
			 * We don't check the error code - if userspace has
			 * not set up a proper pointer then tough luck.
			 */
			put_user(0, tsk->clear_child_tid);
			do_futex(tsk->clear_child_tid, FUTEX_WAKE,
				 1, NULL, NULL, 0, 0);
		}
		tsk->clear_child_tid = NULL;
	}

	/*
	 * All done, finally we can wake up parent and return this mm to him.
	 * Also kthread_stop() uses this completion for synchronization.
	 */
	if (tsk->vfork_done)
		complete_vfork_done(tsk);
}
/** * dup_mm() - duplicates an existing mm structure * @tsk: the task_struct with which the new mm will be associated. * @oldmm: the mm to duplicate. * * Allocates a new mm structure and duplicates the provided @oldmm structure * content into it. * * Return: the duplicated mm or NULL on failure.
*/ staticstruct mm_struct *dup_mm(struct task_struct *tsk, struct mm_struct *oldmm)
{ struct mm_struct *mm; int err;
mm = allocate_mm(); if (!mm) goto fail_nomem;
memcpy(mm, oldmm, sizeof(*mm));
if (!mm_init(mm, tsk, mm->user_ns)) goto fail_nomem;
uprobe_start_dup_mmap();
err = dup_mmap(mm, oldmm); if (err) goto free_pt;
uprobe_end_dup_mmap();
/*
 * NOTE(review): the remainder of this block looks like an extraction merge:
 * the CLONE_CLEAR_SIGHAND / flush_signal_handlers() code below belongs to a
 * different function (copy_signal/copy_sighand area), 'clone_flags' is not
 * declared here, 'return 0' does not match the struct mm_struct * return
 * type, and the 'free_pt'/'fail_nomem' labels are missing.  Restore dup_mm()
 * from the original source.
 */
/* Reset all signal handler not set to SIG_IGN to SIG_DFL. */ if (clone_flags & CLONE_CLEAR_SIGHAND)
flush_signal_handlers(tsk, 0);
return 0;
}
/*
 * Drop one reference to @sighand and release it once the count hits zero.
 */
void __cleanup_sighand(struct sighand_struct *sighand)
{
	if (!refcount_dec_and_test(&sighand->count))
		return;

	signalfd_cleanup(sighand);
	/*
	 * sighand_cachep is SLAB_TYPESAFE_BY_RCU so we can free it
	 * without an RCU grace period, see __lock_task_sighand().
	 */
	kmem_cache_free(sighand_cachep, sighand);
}
/*
 * Copy the current task's seccomp state into the new child @p.
 *
 * Must be called with sighand->lock held, which is common to
 * all threads in the group. Holding cred_guard_mutex is not
 * needed because this new task is not yet running and cannot
 * be racing exec.
 */
static void copy_seccomp(struct task_struct *p)
{
#ifdef CONFIG_SECCOMP
	assert_spin_locked(&current->sighand->siglock);

	/* Ref-count the new filter user, and assign it. */
	get_seccomp_filter(current);
	p->seccomp = current->seccomp;

	/*
	 * Explicitly enable no_new_privs here in case it got set
	 * between the task_struct being duplicated and holding the
	 * sighand lock. The seccomp state and nnp must be in sync.
	 */
	if (task_no_new_privs(current))
		task_set_no_new_privs(p);

	/*
	 * If the parent gained a seccomp mode after copying thread
	 * flags and between before we held the sighand lock, we have
	 * to manually enable the seccomp thread flag here.
	 */
	if (p->seccomp.mode != SECCOMP_MODE_DISABLED)
		set_task_syscall_work(p, SECCOMP);
#endif
}
SYSCALL_DEFINE1(set_tid_address, int __user *, tidptr)
{
current->clear_child_tid = tidptr;
/** * pidfd_prepare - allocate a new pidfd_file and reserve a pidfd * @pid: the struct pid for which to create a pidfd * @flags: flags of the new @pidfd * @ret_file: return the new pidfs file * * Allocate a new file that stashes @pid and reserve a new pidfd number in the * caller's file descriptor table. The pidfd is reserved but not installed yet. * * The helper verifies that @pid is still in use, without PIDFD_THREAD the * task identified by @pid must be a thread-group leader. * * If this function returns successfully the caller is responsible to either * call fd_install() passing the returned pidfd and pidfd file as arguments in * order to install the pidfd into its file descriptor table or they must use * put_unused_fd() and fput() on the returned pidfd and pidfd file * respectively. * * This function is useful when a pidfd must already be reserved but there * might still be points of failure afterwards and the caller wants to ensure * that no pidfd is leaked into its file descriptor table. * * Return: On success, a reserved pidfd is returned from the function and a new * pidfd file is returned in the last argument to the function. On * error, a negative error code is returned from the function and the * last argument remains unchanged.
*/ int pidfd_prepare(struct pid *pid, unsignedint flags, struct file **ret_file)
{ struct file *pidfs_file;
/* * PIDFD_STALE is only allowed to be passed if the caller knows * that @pid is already registered in pidfs and thus * PIDFD_INFO_EXIT information is guaranteed to be available.
*/ if (!(flags & PIDFD_STALE)) { /* * While holding the pidfd waitqueue lock removing the * task linkage for the thread-group leader pid * (PIDTYPE_TGID) isn't possible. Thus, if there's still * task linkage for PIDTYPE_PID not having thread-group * leader linkage for the pid means it wasn't a * thread-group leader in the first place.
*/
guard(spinlock_irq)(&pid->wait_pidfd.lock);
/* Task has already been reaped. */ if (!pid_has_task(pid, PIDTYPE_PID)) return -ESRCH; /* * If this struct pid isn't used as a thread-group * leader but the caller requested to create a * thread-group leader pidfd then report ENOENT.
*/ if (!(flags & PIDFD_THREAD) && !pid_has_task(pid, PIDTYPE_TGID)) return -ENOENT;
}
CLASS(get_unused_fd, pidfd)(O_CLOEXEC); if (pidfd < 0) return pidfd;
pidfs_file = pidfs_alloc_file(pid, flags | O_RDWR); if (IS_ERR(pidfs_file)) return PTR_ERR(pidfs_file);
/*
 * Propagate the parent's oom_score_adj settings to a newly forked process
 * that shares the parent's mm (CLONE_VM without CLONE_THREAD/CLONE_VFORK),
 * and mark the mm as multiprocess for the OOM killer.
 */
static void copy_oom_score_adj(u64 clone_flags, struct task_struct *tsk)
{
	/* Skip if kernel thread */
	if (!tsk->mm)
		return;

	/* Skip if spawning a thread or using vfork */
	if ((clone_flags & (CLONE_VM | CLONE_THREAD | CLONE_VFORK)) != CLONE_VM)
		return;

	/* We need to synchronize with __set_oom_adj */
	mutex_lock(&oom_adj_mutex);
	set_bit(MMF_MULTIPROCESS, &tsk->mm->flags);
	/* Update the values in case they were changed after copy_signal */
	tsk->signal->oom_score_adj = current->signal->oom_score_adj;
	tsk->signal->oom_score_adj_min = current->signal->oom_score_adj_min;
	mutex_unlock(&oom_adj_mutex);
}
#ifdef CONFIG_RV
/* Reset the runtime-verification state of a freshly forked task. */
static void rv_task_fork(struct task_struct *p)
{
	memset(&p->rv, 0, sizeof(p->rv));
}
#else
#define rv_task_fork(p) do {} while (0)
#endif
/* * This creates a new process as a copy of the old one, * but does not actually start it yet. * * It copies the registers, and all the appropriate * parts of the process environment (as per the clone * flags). The actual kick-off is left to the caller.
*/
__latent_entropy struct task_struct *copy_process( struct pid *pid, int trace, int node, struct kernel_clone_args *args)
{ int pidfd = -1, retval; struct task_struct *p; struct multiprocess_signals delayed; struct file *pidfile = NULL; const u64 clone_flags = args->flags; struct nsproxy *nsp = current->nsproxy;
/* * Don't allow sharing the root directory with processes in a different * namespace
*/ if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS)) return ERR_PTR(-EINVAL);
if ((clone_flags & (CLONE_NEWUSER|CLONE_FS)) == (CLONE_NEWUSER|CLONE_FS)) return ERR_PTR(-EINVAL);
/* * Thread groups must share signals as well, and detached threads * can only be started up within the thread group.
*/ if ((clone_flags & CLONE_THREAD) && !(clone_flags & CLONE_SIGHAND)) return ERR_PTR(-EINVAL);
/* * Shared signal handlers imply shared VM. By way of the above, * thread groups also imply shared VM. Blocking this case allows * for various simplifications in other code.
*/ if ((clone_flags & CLONE_SIGHAND) && !(clone_flags & CLONE_VM)) return ERR_PTR(-EINVAL);
/* * Siblings of global init remain as zombies on exit since they are * not reaped by their parent (swapper). To solve this and to avoid * multi-rooted process trees, prevent global and container-inits * from creating siblings.
*/ if ((clone_flags & CLONE_PARENT) &&
current->signal->flags & SIGNAL_UNKILLABLE) return ERR_PTR(-EINVAL);
/* * If the new process will be in a different pid or user namespace * do not allow it to share a thread group with the forking task.
*/ if (clone_flags & CLONE_THREAD) { if ((clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) ||
(task_active_pid_ns(current) != nsp->pid_ns_for_children)) return ERR_PTR(-EINVAL);
}
if (clone_flags & CLONE_PIDFD) { /* * - CLONE_DETACHED is blocked so that we can potentially * reuse it later for CLONE_PIDFD.
*/ if (clone_flags & CLONE_DETACHED) return ERR_PTR(-EINVAL);
}
/* * Force any signals received before this point to be delivered * before the fork happens. Collect up signals sent to multiple * processes that happen during the fork and delay them so that * they appear to happen after the fork.
*/
sigemptyset(&delayed.signal);
INIT_HLIST_NODE(&delayed.node);
spin_lock_irq(¤t->sighand->siglock); if (!(clone_flags & CLONE_THREAD))
hlist_add_head(&delayed.node, ¤t->signal->multiprocess);
recalc_sigpending();
spin_unlock_irq(¤t->sighand->siglock);
retval = -ERESTARTNOINTR; if (task_sigpending(current)) goto fork_out;
retval = -ENOMEM;
p = dup_task_struct(current, node); if (!p) goto fork_out;
p->flags &= ~PF_KTHREAD; if (args->kthread)
p->flags |= PF_KTHREAD; if (args->user_worker) { /* * Mark us a user worker, and block any signal that isn't * fatal or STOP
*/
p->flags |= PF_USER_WORKER;
siginitsetinv(&p->blocked, sigmask(SIGKILL)|sigmask(SIGSTOP));
} if (args->io_thread)
p->flags |= PF_IO_WORKER;
if (args->name)
strscpy_pad(p->comm, args->name, sizeof(p->comm));
/* * If multiple threads are within copy_process(), then this check * triggers too late. This doesn't hurt, the check is only there * to stop root fork bombs.
*/
retval = -EAGAIN; if (data_race(nr_threads >= max_threads)) goto bad_fork_cleanup_count;
/* Perform scheduler related setup. Assign this task to a CPU. */
retval = sched_fork(clone_flags, p); if (retval) goto bad_fork_cleanup_policy;
retval = perf_event_init_task(p, clone_flags); if (retval) goto bad_fork_sched_cancel_fork;
retval = audit_alloc(p); if (retval) goto bad_fork_cleanup_perf; /* copy all the process information */
shm_init_task(p);
retval = security_task_alloc(p, clone_flags); if (retval) goto bad_fork_cleanup_audit;
retval = copy_semundo(clone_flags, p); if (retval) goto bad_fork_cleanup_security;
retval = copy_files(clone_flags, p, args->no_files); if (retval) goto bad_fork_cleanup_semundo;
retval = copy_fs(clone_flags, p); if (retval) goto bad_fork_cleanup_files;
retval = copy_sighand(clone_flags, p); if (retval) goto bad_fork_cleanup_fs;
retval = copy_signal(clone_flags, p); if (retval) goto bad_fork_cleanup_sighand;
retval = copy_mm(clone_flags, p); if (retval) goto bad_fork_cleanup_signal;
retval = copy_namespaces(clone_flags, p); if (retval) goto bad_fork_cleanup_mm;
retval = copy_io(clone_flags, p); if (retval) goto bad_fork_cleanup_namespaces;
retval = copy_thread(p, args); if (retval) goto bad_fork_cleanup_io;
stackleak_task_init(p);
if (pid != &init_struct_pid) {
pid = alloc_pid(p->nsproxy->pid_ns_for_children, args->set_tid,
args->set_tid_size); if (IS_ERR(pid)) {
retval = PTR_ERR(pid); goto bad_fork_cleanup_thread;
}
}
/* * This has to happen after we've potentially unshared the file * descriptor table (so that the pidfd doesn't leak into the child * if the fd table isn't shared).
*/ if (clone_flags & CLONE_PIDFD) { int flags = (clone_flags & CLONE_THREAD) ? PIDFD_THREAD : 0;
/* * Note that no task has been attached to @pid yet indicate * that via CLONE_PIDFD.
*/
retval = pidfd_prepare(pid, flags | PIDFD_STALE, &pidfile); if (retval < 0) goto bad_fork_free_pid;
pidfd = retval;
retval = put_user(pidfd, args->pidfd); if (retval) goto bad_fork_put_pidfd;
}
/* * sigaltstack should be cleared when sharing the same VM
*/ if ((clone_flags & (CLONE_VM|CLONE_VFORK)) == CLONE_VM)
sas_ss_reset(p);
/* * Syscall tracing and stepping should be turned off in the * child regardless of CLONE_PTRACE.
*/
user_disable_single_step(p);
clear_task_syscall_work(p, SYSCALL_TRACE); #ifdefined(CONFIG_GENERIC_ENTRY) || defined(TIF_SYSCALL_EMU)
clear_task_syscall_work(p, SYSCALL_EMU); #endif
clear_tsk_latency_tracing(p);
/* ok, now we should be set up.. */
p->pid = pid_nr(pid); if (clone_flags & CLONE_THREAD) {
p->group_leader = current->group_leader;
p->tgid = current->tgid;
} else {
p->group_leader = p;
p->tgid = p->pid;
}
/* * Ensure that the cgroup subsystem policies allow the new process to be * forked. It should be noted that the new process's css_set can be changed * between here and cgroup_post_fork() if an organisation operation is in * progress.
*/
retval = cgroup_can_fork(p, args); if (retval) goto bad_fork_put_pidfd;
/* * Now that the cgroups are pinned, re-clone the parent cgroup and put * the new task on the correct runqueue. All this *before* the task * becomes visible. * * This isn't part of ->can_fork() because while the re-cloning is * cgroup specific, it unconditionally needs to place the task on a * runqueue.
*/
retval = sched_cgroup_fork(p, args); if (retval) goto bad_fork_cancel_cgroup;
/* * Allocate a default futex hash for the user process once the first * thread spawns.
*/ if (need_futex_hash_allocate_default(clone_flags)) {
retval = futex_hash_allocate_default(); if (retval) goto bad_fork_cancel_cgroup; /* * If we fail beyond this point we don't free the allocated * futex hash map. We assume that another thread will be created * and makes use of it. The hash map will be freed once the main * thread terminates.
*/
} /* * From this point on we must avoid any synchronous user-space * communication until we take the tasklist-lock. In particular, we do * not want user-space to be able to predict the process start-time by * stalling fork(2) after we recorded the start_time but before it is * visible to the system.
*/
/* * Make it visible to the rest of the system, but dont wake it up yet. * Need tasklist lock for parent etc handling!
*/
write_lock_irq(&tasklist_lock);
/* * This is like kernel_clone(), but shaved down and tailored to just * creating io_uring workers. It returns a created task, or an error pointer. * The returned task is inactive, and the caller must fire it up through * wake_up_new_task(p). All signals are blocked in the created task.
*/ struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node)
{ unsignedlong flags = CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD|
CLONE_IO; struct kernel_clone_args args = {
.flags = ((lower_32_bits(flags) | CLONE_VM |
CLONE_UNTRACED) & ~CSIGNAL),
.exit_signal = (lower_32_bits(flags) & CSIGNAL),
.fn = fn,
.fn_arg = arg,
.io_thread = 1,
.user_worker = 1,
};
return copy_process(NULL, 0, node, &args);
}
/* * Ok, this is the main fork-routine. * * It copies the process, and if successful kick-starts * it and waits for it to finish using the VM if required. * * args->exit_signal is expected to be checked for sanity by the caller.
*/
pid_t kernel_clone(struct kernel_clone_args *args)
{
u64 clone_flags = args->flags; struct completion vfork; struct pid *pid; struct task_struct *p; int trace = 0;
pid_t nr;
/* * For legacy clone() calls, CLONE_PIDFD uses the parent_tid argument * to return the pidfd. Hence, CLONE_PIDFD and CLONE_PARENT_SETTID are * mutually exclusive. With clone3() CLONE_PIDFD has grown a separate * field in struct clone_args and it still doesn't make sense to have * them both point at the same memory location. Performing this check * here has the advantage that we don't need to have a separate helper * to check for legacy clone().
*/ if ((clone_flags & CLONE_PIDFD) &&
(clone_flags & CLONE_PARENT_SETTID) &&
(args->pidfd == args->parent_tid)) return -EINVAL;
/* * Determine whether and which event to report to ptracer. When
--> --------------------
--> maximum size reached
--> --------------------
¤ Dauer der Verarbeitung: 0.86Bemerkung:
(vorverarbeitet)
¤
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung ist noch experimentell.