/*
 * Stashes information that userspace needs to access even after the
 * process has been reaped.
 */
struct pidfs_exit_info {
	__u64 cgroupid;      /* presumably the cgroup id of the exited task — confirm against writer */
	__s32 exit_code;     /* exit code captured at exit time */
	__u32 coredump_mask; /* PIDFD_COREDUMP* bits; published via smp_store_release() by the coredump path */
};
/* * On 64 bit nothing special happens. The 64bit number assigned * to struct pid is the inode number. * * On 32 bit the 64 bit number assigned to struct pid is split * into two 32 bit numbers. The lower 32 bits are used as the * inode number and the upper 32 bits are used as the inode * generation number. * * On 32 bit pidfs_ino() will return the lower 32 bit. When * pidfs_ino() returns zero a wrap around happened. When a * wraparound happens the 64 bit number will be incremented by 2 * so inode numbering starts at 2 again. * * On 64 bit comparing two pidfds is as simple as comparing * inode numbers. * * When a wraparound happens on 32 bit multiple pidfds with the * same inode number are likely to exist (This isn't a problem * since before pidfs pidfds used the anonymous inode meaning * all pidfds had the same inode number.). Userspace can * reconstruct the 64 bit identifier by retrieving both the * inode number and the inode generation number to compare or * use file handles.
*/ if (pidfs_ino(pidfs_ino_nr) == 0)
pidfs_ino_nr += 2;
/* * Any dentry must've been wiped from the pid by now. * Otherwise there's a reference count bug.
*/
VFS_WARN_ON_ONCE(pid->stashed);
/* * This if an error occurred during e.g., task creation that * causes us to never go through the exit path.
*/ if (unlikely(!attr)) return;
/* This never had a pidfd created. */ if (IS_ERR(attr)) return;
xattrs = no_free_ptr(attr->xattrs); if (xattrs)
simple_xattrs_free(xattrs, NULL);
}
#ifdef CONFIG_PROC_FS
/**
 * pidfd_show_fdinfo - print information about a pidfd
 * @m: proc fdinfo file
 * @f: file referencing a pidfd
 *
 * Pid:
 * This function will print the pid that a given pidfd refers to in the
 * pid namespace of the procfs instance.
 * If the pid namespace of the process is not a descendant of the pid
 * namespace of the procfs instance 0 will be shown as its pid. This is
 * similar to calling getppid() on a process whose parent is outside of
 * its pid namespace.
 *
 * NSpid:
 * If pid namespaces are supported then this function will also print
 * the pid of a given pidfd refers to for all descendant pid namespaces
 * starting from the current pid namespace of the instance, i.e. the
 * Pid field and the first entry in the NSpid field will be identical.
 * If the pid namespace of the process is not a descendant of the pid
 * namespace of the procfs instance 0 will be shown as its first NSpid
 * entry and no others will be shown.
 * Note that this differs from the Pid and NSpid fields in
 * /proc/<pid>/status where Pid and NSpid are always shown relative to
 * the pid namespace of the procfs instance. The difference becomes
 * obvious when sending around a pidfd between pid namespaces from a
 * different branch of the tree, i.e. where no ancestral relation is
 * present between the pid namespaces:
 * - create two new pid namespaces ns1 and ns2 in the initial pid
 *   namespace (also take care to create new mount namespaces in the
 *   new pid namespace and mount procfs)
 * - create a process with a pidfd in ns1
 * - send pidfd from ns1 to ns2
 * - read /proc/self/fdinfo/<pidfd> and observe that both Pid and NSpid
 *   have exactly one entry, which is 0
 */
static void pidfd_show_fdinfo(struct seq_file *m, struct file *f)
{
	struct pid *pid = pidfd_pid(f);
	struct pid_namespace *ns;
	pid_t nr = -1;

	/* Only resolve a pid number while a task is still attached. */
	if (likely(pid_has_task(pid, PIDTYPE_PID))) {
		ns = proc_pid_ns(file_inode(m->file)->i_sb);
		nr = pid_nr_ns(pid, ns);
	}

	seq_put_decimal_ll(m, "Pid:\t", nr);

#ifdef CONFIG_PID_NS
	seq_put_decimal_ll(m, "\nNSpid:\t", nr);
	if (nr > 0) {
		int i;

		/*
		 * If nr is non-zero it means that 'pid' is valid and that
		 * ns, i.e. the pid namespace associated with the procfs
		 * instance, is in the pid namespace hierarchy of pid.
		 * Start at one below the already printed level.
		 */
		for (i = ns->level + 1; i <= pid->level; i++)
			seq_put_decimal_ll(m, "\t", pid->numbers[i].nr);
	}
#endif
	seq_putc(m, '\n');
}
#endif
/* * Poll support for process exit notification.
*/ static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts)
{ struct pid *pid = pidfd_pid(file); struct task_struct *task;
__poll_t poll_flags = 0;
poll_wait(file, &pid->wait_pidfd, pts); /* * Don't wake waiters if the thread-group leader exited * prematurely. They either get notified when the last subthread * exits or not at all if one of the remaining subthreads execs * and assumes the struct pid of the old thread-group leader.
*/
guard(rcu)();
task = pid_task(pid, PIDTYPE_PID); if (!task)
poll_flags = EPOLLIN | EPOLLRDNORM | EPOLLHUP; elseif (task->exit_state && !delay_group_leader(task))
poll_flags = EPOLLIN | EPOLLRDNORM;
task = get_pid_task(pid, PIDTYPE_PID); if (!task) { /* * If the task has already been reaped, only exit * information is available
*/ if (!(mask & PIDFD_INFO_EXIT)) return -ESRCH;
goto copy_out;
}
c = get_task_cred(task); if (!c) return -ESRCH;
if ((kinfo.mask & PIDFD_INFO_COREDUMP) && !(kinfo.coredump_mask)) {
task_lock(task); if (task->mm)
kinfo.coredump_mask = pidfs_coredump_mask(task->mm->flags);
task_unlock(task);
}
/* Unconditionally return identifiers and credentials, the rest only on request */
/* * Copy pid/tgid last, to reduce the chances the information might be * stale. Note that it is not possible to ensure it will be valid as the * task might return as soon as the copy_to_user finishes, but that's ok * and userspace expects that might happen and can act accordingly, so * this is just best-effort. What we can do however is checking that all * the fields are set correctly, or return ESRCH to avoid providing
* incomplete information. */
if (kinfo.pid == 0 || kinfo.tgid == 0) return -ESRCH;
copy_out: /* * If userspace and the kernel have the same struct size it can just * be copied. If userspace provides an older struct, only the bits that * userspace knows about will be copied. If userspace provides a new * struct, only the bits that the kernel knows about will be copied.
*/ return copy_struct_to_user(uinfo, usize, &kinfo, sizeof(kinfo), NULL);
}
staticbool pidfs_ioctl_valid(unsignedint cmd)
{ switch (cmd) { case FS_IOC_GETVERSION: case PIDFD_GET_CGROUP_NAMESPACE: case PIDFD_GET_IPC_NAMESPACE: case PIDFD_GET_MNT_NAMESPACE: case PIDFD_GET_NET_NAMESPACE: case PIDFD_GET_PID_FOR_CHILDREN_NAMESPACE: case PIDFD_GET_TIME_NAMESPACE: case PIDFD_GET_TIME_FOR_CHILDREN_NAMESPACE: case PIDFD_GET_UTS_NAMESPACE: case PIDFD_GET_USER_NAMESPACE: case PIDFD_GET_PID_NAMESPACE: returntrue;
}
/* Extensible ioctls require some more careful checks. */ switch (_IOC_NR(cmd)) { case _IOC_NR(PIDFD_GET_INFO): /* * Try to prevent performing a pidfd ioctl when someone * erronously mistook the file descriptor for a pidfd. * This is not perfect but will catch most cases.
*/ return extensible_ioctl_valid(cmd, PIDFD_GET_INFO, PIDFD_INFO_SIZE_VER0);
}
/* Extensible IOCTL that does not open namespace FDs, take a shortcut */ if (_IOC_NR(cmd) == _IOC_NR(PIDFD_GET_INFO)) return pidfd_info(file, cmd, arg);
task = get_pid_task(pidfd_pid(file), PIDTYPE_PID); if (!task) return -ESRCH;
if (arg) return -EINVAL;
scoped_guard(task_lock, task) {
nsp = task->nsproxy; if (nsp)
get_nsproxy(nsp);
} if (!nsp) return -ESRCH; /* just pretend it didn't exist */
/* * We're trying to open a file descriptor to the namespace so perform a * filesystem cred ptrace check. Also, we mirror nsfs behavior.
*/ if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) return -EACCES;
switch (cmd) { /* Namespaces that hang of nsproxy. */ case PIDFD_GET_CGROUP_NAMESPACE: if (IS_ENABLED(CONFIG_CGROUPS)) {
get_cgroup_ns(nsp->cgroup_ns);
ns_common = to_ns_common(nsp->cgroup_ns);
} break; case PIDFD_GET_IPC_NAMESPACE: if (IS_ENABLED(CONFIG_IPC_NS)) {
get_ipc_ns(nsp->ipc_ns);
ns_common = to_ns_common(nsp->ipc_ns);
} break; case PIDFD_GET_MNT_NAMESPACE:
get_mnt_ns(nsp->mnt_ns);
ns_common = to_ns_common(nsp->mnt_ns); break; case PIDFD_GET_NET_NAMESPACE: if (IS_ENABLED(CONFIG_NET_NS)) {
ns_common = to_ns_common(nsp->net_ns);
get_net_ns(ns_common);
} break; case PIDFD_GET_PID_FOR_CHILDREN_NAMESPACE: if (IS_ENABLED(CONFIG_PID_NS)) {
get_pid_ns(nsp->pid_ns_for_children);
ns_common = to_ns_common(nsp->pid_ns_for_children);
} break; case PIDFD_GET_TIME_NAMESPACE: if (IS_ENABLED(CONFIG_TIME_NS)) {
get_time_ns(nsp->time_ns);
ns_common = to_ns_common(nsp->time_ns);
} break; case PIDFD_GET_TIME_FOR_CHILDREN_NAMESPACE: if (IS_ENABLED(CONFIG_TIME_NS)) {
get_time_ns(nsp->time_ns_for_children);
ns_common = to_ns_common(nsp->time_ns_for_children);
} break; case PIDFD_GET_UTS_NAMESPACE: if (IS_ENABLED(CONFIG_UTS_NS)) {
get_uts_ns(nsp->uts_ns);
ns_common = to_ns_common(nsp->uts_ns);
} break; /* Namespaces that don't hang of nsproxy. */ case PIDFD_GET_USER_NAMESPACE: if (IS_ENABLED(CONFIG_USER_NS)) {
rcu_read_lock();
ns_common = to_ns_common(get_user_ns(task_cred_xxx(task, user_ns)));
rcu_read_unlock();
} break; case PIDFD_GET_PID_NAMESPACE: if (IS_ENABLED(CONFIG_PID_NS)) {
rcu_read_lock();
pid_ns = task_active_pid_ns(task); if (pid_ns)
ns_common = to_ns_common(get_pid_ns(pid_ns));
rcu_read_unlock();
} break; default: return -ENOIOCTLCMD;
}
if (!ns_common) return -EOPNOTSUPP;
/* open_namespace() unconditionally consumes the reference */ return open_namespace(ns_common);
}
/* * We're called from release_task(). We know there's at least one * reference to struct pid being held that won't be released until the * task has been reaped which cannot happen until we're out of * release_task(). * * If this struct pid has at least once been referred to by a pidfd then * pid->attr will be allocated. If not we mark the struct pid as dead so * anyone who is trying to register it with pidfs will fail to do so. * Otherwise we would hand out pidfs for reaped tasks without having * exit information available. * * Worst case is that we've filled in the info and the pid gets freed * right away in free_pid() when no one holds a pidfd anymore. Since * pidfs_exit() currently is placed after exit_task_work() we know that * it cannot be us aka the exiting task holding a pidfd to itself.
*/ void pidfs_exit(struct task_struct *tsk)
{ struct pid *pid = task_pid(tsk); struct pidfs_attr *attr; struct pidfs_exit_info *exit_info; #ifdef CONFIG_CGROUPS struct cgroup *cgrp; #endif
might_sleep();
guard(spinlock_irq)(&pid->wait_pidfd.lock);
attr = pid->attr; if (!attr) { /* * No one ever held a pidfd for this struct pid. * Mark it as dead so no one can add a pidfs * entry anymore. We're about to be reaped and * so no exit information would be available.
*/
pid->attr = PIDFS_PID_DEAD; return;
}
/* * If @pid->attr is set someone might still legitimately hold a * pidfd to @pid or someone might concurrently still be getting * a reference to an already stashed dentry from @pid->stashed. * So defer cleaning @pid->attr until the last reference to @pid * is put
*/
exit_info = &attr->__pei; /* Note how we were coredumped. */
coredump_mask = pidfs_coredump_mask(cprm->mm_flags); /* Note that we actually did coredump. */
coredump_mask |= PIDFD_COREDUMPED; /* If coredumping is set to skip we should never end up here. */
VFS_WARN_ON_ONCE(coredump_mask & PIDFD_COREDUMP_SKIP);
smp_store_release(&exit_info->coredump_mask, coredump_mask);
} #endif
staticstruct vfsmount *pidfs_mnt __ro_after_init;
/*
 * The vfs falls back to simple_setattr() if i_op->setattr() isn't
 * implemented. Let's reject it completely until we have a clean
 * permission concept for pidfds.
 */
static int pidfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
			 struct iattr *attr)
{
	/* Delegates to the common anon-inode helper (which rejects changes). */
	return anon_inode_setattr(idmap, dentry, attr);
}
/*
 * 'lsof' has knowledge of our historical anon_inode use, and expects
 * the pidfs dentry name to start with 'anon_inode'.
 */
static char *pidfs_dname(struct dentry *dentry, char *buffer, int buflen)
{
	return dynamic_dname(buffer, buflen, "anon_inode:[pidfd]");
}
pid = pidfs_ino_get_pid(pid_ino); if (!pid) return NULL;
ret = path_from_stashed(&pid->stashed, pidfs_mnt, pid, &path); if (ret < 0) return ERR_PTR(ret);
VFS_WARN_ON_ONCE(!pid->attr);
mntput(path.mnt); return path.dentry;
}
/*
 * Make sure that we reject any nonsensical flags that users pass via
 * open_by_handle_at(). Note that PIDFD_THREAD is defined as O_EXCL, and
 * PIDFD_NONBLOCK as O_NONBLOCK.
 */
#define VALID_FILE_HANDLE_OPEN_FLAGS \
	(O_RDONLY | O_WRONLY | O_RDWR | O_NONBLOCK | O_CLOEXEC | O_EXCL)
/* * pidfd_ino_get_pid() will verify that the struct pid is part * of the caller's pid namespace hierarchy. No further * permission checks are needed.
*/ return 0;
}
staticstruct file *pidfs_export_open(struct path *path, unsignedint oflags)
{ /* * Clear O_LARGEFILE as open_by_handle_at() forces it and raise * O_RDWR as pidfds always are.
*/
oflags &= ~O_LARGEFILE; return dentry_open(path, oflags | O_RDWR, current_cred());
}
/* * Ensure that PIDFD_STALE can be passed as a flag without * overloading other uapi pidfd flags.
*/
BUILD_BUG_ON(PIDFD_STALE == PIDFD_THREAD);
BUILD_BUG_ON(PIDFD_STALE == PIDFD_NONBLOCK);
ret = path_from_stashed(&pid->stashed, pidfs_mnt, get_pid(pid), &path); if (ret < 0) return ERR_PTR(ret);
VFS_WARN_ON_ONCE(!pid->attr);
flags &= ~PIDFD_STALE;
flags |= O_RDWR;
pidfd_file = dentry_open(&path, flags, current_cred()); /* Raise PIDFD_THREAD explicitly as do_dentry_open() strips it. */ if (!IS_ERR(pidfd_file))
pidfd_file->f_flags |= (flags & PIDFD_THREAD);
/*
 * NOTE(review): the following trailing text is an unrelated German website
 * disclaimer that leaked into this file during extraction; it is not C code
 * and should be removed from the source. Translation preserved for the
 * record: "The information on this website was carefully compiled to the
 * best of our knowledge. However, neither completeness, nor correctness,
 * nor quality of the provided information is guaranteed. Note: the colored
 * syntax rendering and the measurement are still experimental."
 */