/* * Returns true if current's euid is same as p's uid or euid, * or has CAP_SYS_NICE to p's user_ns. * * Called with rcu_read_lock, creds are safe
*/ staticbool set_one_prio_perm(struct task_struct *p)
{ conststruct cred *cred = current_cred(), *pcred = __task_cred(p);
if (uid_eq(pcred->uid, cred->euid) ||
uid_eq(pcred->euid, cred->euid)) returntrue; if (ns_capable(pcred->user_ns, CAP_SYS_NICE)) returntrue; returnfalse;
}
/* * set the priority of a task * - the caller must hold the RCU read lock
*/ staticint set_one_prio(struct task_struct *p, int niceval, int error)
{ int no_nice;
if (which > PRIO_USER || which < PRIO_PROCESS) goto out;
/* normalize: avoid signed division (rounding problems) */
error = -ESRCH; if (niceval < MIN_NICE)
niceval = MIN_NICE; if (niceval > MAX_NICE)
niceval = MAX_NICE;
rcu_read_lock(); switch (which) { case PRIO_PROCESS: if (who)
p = find_task_by_vpid(who); else
p = current; if (p)
error = set_one_prio(p, niceval, error); break; case PRIO_PGRP: if (who)
pgrp = find_vpid(who); else
pgrp = task_pgrp(current);
read_lock(&tasklist_lock);
do_each_pid_thread(pgrp, PIDTYPE_PGID, p) {
error = set_one_prio(p, niceval, error);
} while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
read_unlock(&tasklist_lock); break; case PRIO_USER:
uid = make_kuid(cred->user_ns, who);
user = cred->user; if (!who)
uid = cred->uid; elseif (!uid_eq(uid, cred->uid)) {
user = find_user(uid); if (!user) goto out_unlock; /* No processes for this user */
}
for_each_process_thread(g, p) { if (uid_eq(task_uid(p), uid) && task_pid_vnr(p))
error = set_one_prio(p, niceval, error);
} if (!uid_eq(uid, cred->uid))
free_uid(user); /* For find_user() */ break;
}
out_unlock:
rcu_read_unlock();
out: return error;
}
/*
 * Ugh. To avoid negative return values, "getpriority()" will
 * not return the normal nice-value, but a negated value that
 * has been offset by 20 (ie it returns 40..1 instead of -20..19)
 * to stay compatible.
 */
SYSCALL_DEFINE2(getpriority, int, which, int, who)
{
	struct task_struct *g, *p;
	struct user_struct *user;
	const struct cred *cred = current_cred();
	long niceval, retval = -ESRCH;
	struct pid *pgrp;
	kuid_t uid;

	if (which > PRIO_USER || which < PRIO_PROCESS)
		return -EINVAL;

	rcu_read_lock();
	switch (which) {
	case PRIO_PROCESS:
		if (who)
			p = find_task_by_vpid(who);
		else
			p = current;
		if (p) {
			niceval = nice_to_rlimit(task_nice(p));
			if (niceval > retval)
				retval = niceval;
		}
		break;
	case PRIO_PGRP:
		if (who)
			pgrp = find_vpid(who);
		else
			pgrp = task_pgrp(current);
		read_lock(&tasklist_lock);
		do_each_pid_thread(pgrp, PIDTYPE_PGID, p) {
			niceval = nice_to_rlimit(task_nice(p));
			/* Report the highest priority (lowest nice) in the group. */
			if (niceval > retval)
				retval = niceval;
		} while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
		read_unlock(&tasklist_lock);
		break;
	case PRIO_USER:
		uid = make_kuid(cred->user_ns, who);
		user = cred->user;
		if (!who)
			uid = cred->uid;
		else if (!uid_eq(uid, cred->uid)) {
			user = find_user(uid);
			if (!user)
				goto out_unlock;	/* No processes for this user */
		}
		for_each_process_thread(g, p) {
			if (uid_eq(task_uid(p), uid) && task_pid_vnr(p)) {
				niceval = nice_to_rlimit(task_nice(p));
				if (niceval > retval)
					retval = niceval;
			}
		}
		if (!uid_eq(uid, cred->uid))
			free_uid(user);		/* for find_user() */
		break;
	}
out_unlock:
	rcu_read_unlock();

	return retval;
}
/* * Unprivileged users may change the real gid to the effective gid * or vice versa. (BSD-style) * * If you set the real gid at all, or set the effective gid to a value not * equal to the real gid, then the saved gid is set to the new effective gid. * * This makes it possible for a setgid program to completely drop its * privileges, which is often a useful assertion to make when you are doing * a security audit over a program. * * The general idea is that a program which uses just setregid() will be * 100% compatible with BSD. A program which uses just setgid() will be * 100% compatible with POSIX with saved IDs. * * SMP: There are not races, the GIDs are checked only by filesystem * operations (as far as semantic preservation is concerned).
*/ #ifdef CONFIG_MULTIUSER long __sys_setregid(gid_t rgid, gid_t egid)
{ struct user_namespace *ns = current_user_ns(); conststruct cred *old; struct cred *new; int retval;
kgid_t krgid, kegid;
/*
 * Update PF_NPROC_EXCEEDED on the current task after a uid change.
 * Only evaluated when the credentials switched to a different ucounts.
 */
static void flag_nproc_exceeded(struct cred *new)
{
	if (new->ucounts == current_ucounts())
		return;

	/*
	 * We don't fail in case of NPROC limit excess here because too many
	 * poorly written programs don't check set*uid() return code, assuming
	 * it never fails if called by root.  We may still enforce NPROC limit
	 * for programs doing set*uid()+execve() by harmlessly deferring the
	 * failure to the execve() stage.
	 */
	if (is_rlimit_overlimit(new->ucounts, UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC)) &&
			new->user != INIT_USER)
		current->flags |= PF_NPROC_EXCEEDED;
	else
		current->flags &= ~PF_NPROC_EXCEEDED;
}
/* * Unprivileged users may change the real uid to the effective uid * or vice versa. (BSD-style) * * If you set the real uid at all, or set the effective uid to a value not * equal to the real uid, then the saved uid is set to the new effective uid. * * This makes it possible for a setuid program to completely drop its * privileges, which is often a useful assertion to make when you are doing * a security audit over a program. * * The general idea is that a program which uses just setreuid() will be * 100% compatible with BSD. A program which uses just setuid() will be * 100% compatible with POSIX with saved IDs.
*/ long __sys_setreuid(uid_t ruid, uid_t euid)
{ struct user_namespace *ns = current_user_ns(); conststruct cred *old; struct cred *new; int retval;
kuid_t kruid, keuid;
/* * setuid() is implemented like SysV with SAVED_IDS * * Note that SAVED_ID's is deficient in that a setuid root program * like sendmail, for example, cannot set its uid to be a normal * user and then switch back, because if you're root, setuid() sets * the saved uid too. If you don't like this, blame the bright people * in the POSIX committee and/or USG. Note that the BSD-style setreuid() * will allow a root program to temporarily drop privileges and be able to * regain them by swapping the real and effective uid.
*/ long __sys_setuid(uid_t uid)
{ struct user_namespace *ns = current_user_ns(); conststruct cred *old; struct cred *new; int retval;
kuid_t kuid;
kuid = make_kuid(ns, uid); if (!uid_valid(kuid)) return -EINVAL;
new = prepare_creds(); if (!new) return -ENOMEM;
old = current_cred();
/* * This function implements a generic ability to update ruid, euid, * and suid. This allows you to implement the 4.4 compatible seteuid().
*/ long __sys_setresuid(uid_t ruid, uid_t euid, uid_t suid)
{ struct user_namespace *ns = current_user_ns(); conststruct cred *old; struct cred *new; int retval;
kuid_t kruid, keuid, ksuid; bool ruid_new, euid_new, suid_new;
retval = put_user(rgid, rgidp); if (!retval) {
retval = put_user(egid, egidp); if (!retval)
retval = put_user(sgid, sgidp);
}
return retval;
}
/* * "setfsuid()" sets the fsuid - the uid used for filesystem checks. This * is used for "access()" and for the NFS daemon (letting nfsd stay at * whatever uid it wants to). It normally shadows "euid", except when * explicitly set by setfsuid() or for access..
*/ long __sys_setfsuid(uid_t uid)
{ conststruct cred *old; struct cred *new;
uid_t old_fsuid;
kuid_t kuid;
old = current_cred();
old_fsuid = from_kuid_munged(old->user_ns, old->fsuid);
kuid = make_kuid(old->user_ns, uid); if (!uid_valid(kuid)) return old_fsuid;
new = prepare_creds(); if (!new) return old_fsuid;
/**
 * sys_getpid - return the thread group id of the current process
 *
 * Note, despite the name, this returns the tgid not the pid.  The tgid and
 * the pid are identical unless CLONE_THREAD was specified on clone() in
 * which case the tgid is the same in all threads of the same group.
 *
 * This is SMP safe as current->tgid does not change.
 */
SYSCALL_DEFINE0(getpid)
{
	return task_tgid_vnr(current);
}
/* Thread ID - the internal kernel "pid" */
SYSCALL_DEFINE0(gettid)
{
	return task_pid_vnr(current);
}
/* * Accessing ->real_parent is not SMP-safe, it could * change from under us. However, we can use a stale * value of ->real_parent under rcu_read_lock(), see * release_task()->call_rcu(delayed_put_task_struct).
*/
SYSCALL_DEFINE0(getppid)
{ int pid;
/*
 * This needs some heavy checking ...
 * I just haven't the stomach for it. I also don't fully
 * understand sessions/pgrp etc. Let somebody who does explain it.
 *
 * OK, I think I have the protection semantics right.... this is really
 * only important on a multi-user system anyway, to make sure one user
 * can't send a signal to a process owned by another.  -TYT, 12/12/91
 *
 * !PF_FORKNOEXEC check to conform completely to POSIX.
 */
SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid)
{
	struct task_struct *p;
	struct task_struct *group_leader = current->group_leader;
	struct pid *pids[PIDTYPE_MAX] = { 0 };
	struct pid *pgrp;
	int err;

	if (!pid)
		pid = task_pid_vnr(group_leader);
	if (!pgid)
		pgid = pid;
	if (pgid < 0)
		return -EINVAL;
	rcu_read_lock();

	/* From this point forward we keep holding onto the tasklist lock
	 * so that our parent does not change from under us. -DaveM
	 */
	write_lock_irq(&tasklist_lock);

	err = -ESRCH;
	p = find_task_by_vpid(pid);
	if (!p)
		goto out;

	err = -EINVAL;
	if (!thread_group_leader(p))
		goto out;

	if (same_thread_group(p->real_parent, group_leader)) {
		err = -EPERM;
		if (task_session(p) != task_session(group_leader))
			goto out;
		err = -EACCES;
		if (!(p->flags & PF_FORKNOEXEC))
			goto out;
	} else {
		err = -ESRCH;
		if (p != group_leader)
			goto out;
	}

	/* A session leader may not change its process group. */
	err = -EPERM;
	if (p->signal->leader)
		goto out;

	pgrp = task_pid(p);
	if (pgid != pid) {
		struct task_struct *g;

		pgrp = find_vpid(pgid);
		g = pid_task(pgrp, PIDTYPE_PGID);
		if (!g || task_session(g) != task_session(group_leader))
			goto out;
	}

	err = security_task_setpgid(p, pgid);
	if (err)
		goto out;

	if (task_pgrp(p) != pgrp)
		change_pid(pids, p, PIDTYPE_PGID, pgrp);

	err = 0;
out:
	/* All paths lead to here, thus we are safe. -DaveM */
	write_unlock_irq(&tasklist_lock);
	rcu_read_unlock();
	free_pids(pids);
	return err;
}
/* * Work around broken programs that cannot handle "Linux 3.0". * Instead we map 3.x to 2.6.40+x, so e.g. 3.0 would be 2.6.40 * And we map 4.x and later versions to 2.6.60+x, so 4.0/5.0/6.0/... would be * 2.6.60.
*/ staticint override_release(char __user *release, size_t len)
{ int ret = 0;
if (len < 0) return -EINVAL;
down_read(&uts_sem);
u = utsname();
i = 1 + strlen(u->nodename); if (i > len)
i = len;
memcpy(tmp, u->nodename, i);
up_read(&uts_sem); if (copy_to_user(name, tmp, i)) return -EFAULT; return 0;
}
#endif
/* * Only setdomainname; getdomainname can be implemented by calling * uname()
*/
SYSCALL_DEFINE2(setdomainname, char __user *, name, int, len)
{ int errno; char tmp[__NEW_UTS_LEN];
if (!ns_capable(current->nsproxy->uts_ns->user_ns, CAP_SYS_ADMIN)) return -EPERM; if (len < 0 || len > __NEW_UTS_LEN) return -EINVAL;
/* make sure you are allowed to change @tsk limits before calling this */ staticint do_prlimit(struct task_struct *tsk, unsignedint resource, struct rlimit *new_rlim, struct rlimit *old_rlim)
{ struct rlimit *rlim; int retval = 0;
if (resource >= RLIM_NLIMITS) return -EINVAL;
resource = array_index_nospec(resource, RLIM_NLIMITS);
if (new_rlim) { if (new_rlim->rlim_cur > new_rlim->rlim_max) return -EINVAL; if (resource == RLIMIT_NOFILE &&
new_rlim->rlim_max > sysctl_nr_open) return -EPERM;
}
/* Holding a refcount on tsk protects tsk->signal from disappearing. */
rlim = tsk->signal->rlim + resource;
task_lock(tsk->group_leader); if (new_rlim) { /* * Keep the capable check against init_user_ns until cgroups can * contain all limits.
*/ if (new_rlim->rlim_max > rlim->rlim_max &&
!capable(CAP_SYS_RESOURCE))
retval = -EPERM; if (!retval)
retval = security_task_setrlimit(tsk, resource, new_rlim);
} if (!retval) { if (old_rlim)
*old_rlim = *rlim; if (new_rlim)
*rlim = *new_rlim;
}
task_unlock(tsk->group_leader);
/* * RLIMIT_CPU handling. Arm the posix CPU timer if the limit is not * infinite. In case of RLIM_INFINITY the posix CPU timer code * ignores the rlimit.
*/ if (!retval && new_rlim && resource == RLIMIT_CPU &&
new_rlim->rlim_cur != RLIM_INFINITY &&
IS_ENABLED(CONFIG_POSIX_TIMERS)) { /* * update_rlimit_cpu can fail if the task is exiting, but there * may be other tasks in the thread group that are not exiting, * and they need their cpu timers adjusted. * * The group_leader is the last task to be released, so if we * cannot update_rlimit_cpu on it, then the entire process is * exiting and we do not need to update at all.
*/
update_rlimit_cpu(tsk->group_leader, new_rlim->rlim_cur);
}
if (new_rlim) { if (copy_from_user(&new64, new_rlim, sizeof(new64))) return -EFAULT;
rlim64_to_rlim(&new64, &new);
checkflags |= LSM_PRLIMIT_WRITE;
}
rcu_read_lock();
tsk = pid ? find_task_by_vpid(pid) : current; if (!tsk) {
rcu_read_unlock(); return -ESRCH;
}
ret = check_prlimit_permission(tsk, checkflags); if (ret) {
rcu_read_unlock(); return ret;
}
get_task_struct(tsk);
rcu_read_unlock();
need_tasklist = !same_thread_group(tsk, current); if (need_tasklist) { /* * Ensure we can't race with group exit or de_thread(), * so tsk->group_leader can't be freed or changed until * read_unlock(tasklist_lock) below.
*/
read_lock(&tasklist_lock); if (!pid_alive(tsk))
ret = -ESRCH;
}
if (!ret) {
ret = do_prlimit(tsk, resource, new_rlim ? &new : NULL,
old_rlim ? &old : NULL);
}
if (need_tasklist)
read_unlock(&tasklist_lock);
if (!ret && old_rlim) {
rlim_to_rlim64(&old, &old64); if (copy_to_user(old_rlim, &old64, sizeof(old64)))
ret = -EFAULT;
}
/* * It would make sense to put struct rusage in the task_struct, * except that would make the task_struct be *really big*. After * task_struct gets moved into malloc'ed memory, it would * make sense to do this. It will make moving the rest of the information * a lot simpler! (Which we're not doing right now because we're not * measuring them yet). * * When sampling multiple threads for RUSAGE_SELF, under SMP we might have * races with threads incrementing their own counters. But since word * reads are atomic, we either get new values or old values and we don't * care which for the sums. We always take the siglock to protect reading * the c* fields from p->signal from races with exit.c updating those * fields when reaping, so a sample either gets all the additions of a * given child after it's reaped, or none so this sample is before reaping. * * Locking: * We need to take the siglock for CHILDEREN, SELF and BOTH * for the cases current multithreaded, non-current single threaded * non-current multithreaded. Thread traversal is now safe with * the siglock held. * Strictly speaking, we donot need to take the siglock if we are current and * single threaded, as no one else can take our signal_struct away, no one * else can reap the children to update signal->c* counters, and no one else * can race with the signal-> fields. If we do not take any lock, the * signal-> fields could be read out of order while another thread was just * exiting. So we should place a read memory barrier when we avoid the lock. * On the writer side, write memory barrier is implied in __exit_signal * as __exit_signal releases the siglock spinlock after updating the signal-> * fields. But we don't do this yet to keep things simple. *
*/
/* * Because the original mm->exe_file points to executable file, make * sure that this one is executable as well, to avoid breaking an * overall picture.
*/ if (!S_ISREG(inode->i_mode) || path_noexec(&fd_file(exe)->f_path)) return -EACCES;
err = file_permission(fd_file(exe), MAY_EXEC); if (err) return err;
return replace_mm_exe_file(mm, fd_file(exe));
}
/* * Check arithmetic relations of passed addresses. * * WARNING: we don't require any capability here so be very careful * in what is allowed for modification from userspace.
*/ staticint validate_prctl_map_addr(struct prctl_mm_map *prctl_map)
{ unsignedlong mmap_max_addr = TASK_SIZE; int error = -EINVAL, i;
/* * Make sure the members are not somewhere outside * of allowed address space.
*/ for (i = 0; i < ARRAY_SIZE(offsets); i++) {
u64 val = *(u64 *)((char *)prctl_map + offsets[i]);
/* * Neither we should allow to override limits if they set.
*/ if (check_data_rlimit(rlimit(RLIMIT_DATA), prctl_map->brk,
prctl_map->start_brk, prctl_map->end_data,
prctl_map->start_data)) goto out;
if (opt == PR_SET_MM_MAP_SIZE) return put_user((unsignedint)sizeof(prctl_map),
(unsignedint __user *)addr);
if (data_size != sizeof(prctl_map)) return -EINVAL;
if (copy_from_user(&prctl_map, addr, sizeof(prctl_map))) return -EFAULT;
error = validate_prctl_map_addr(&prctl_map); if (error) return error;
if (prctl_map.auxv_size) { /* * Someone is trying to cheat the auxv vector.
*/ if (!prctl_map.auxv ||
prctl_map.auxv_size > sizeof(mm->saved_auxv)) return -EINVAL;
/* Last entry must be AT_NULL as specification requires */
user_auxv[AT_VECTOR_SIZE - 2] = AT_NULL;
user_auxv[AT_VECTOR_SIZE - 1] = AT_NULL;
}
if (prctl_map.exe_fd != (u32)-1) { /* * Check if the current user is checkpoint/restore capable. * At the time of this writing, it checks for CAP_SYS_ADMIN * or CAP_CHECKPOINT_RESTORE. * Note that a user with access to ptrace can masquerade an * arbitrary program as any executable, even setuid ones. * This may have implications in the tomoyo subsystem.
*/ if (!checkpoint_restore_ns_capable(current_user_ns())) return -EPERM;
error = prctl_set_mm_exe_file(mm, prctl_map.exe_fd); if (error) return error;
}
/* * arg_lock protects concurrent updates but we still need mmap_lock for * read to exclude races with sys_brk.
*/
mmap_read_lock(mm);
/* * We don't validate if these members are pointing to * real present VMAs because application may have correspond * VMAs already unmapped and kernel uses these members for statistics * output in procfs mostly, except * * - @start_brk/@brk which are used in do_brk_flags but kernel lookups * for VMAs when updating these members so anything wrong written * here cause kernel to swear at userspace program but won't lead * to any problem in kernel itself
*/
/* * Note this update of @saved_auxv is lockless thus * if someone reads this member in procfs while we're * updating -- it may get partly updated results. It's * known and acceptable trade off: we leave it as is to * not introduce additional locks here making the kernel * more complex.
*/ if (prctl_map.auxv_size)
memcpy(mm->saved_auxv, user_auxv, sizeof(user_auxv));
staticint prctl_set_auxv(struct mm_struct *mm, unsignedlong addr, unsignedlong len)
{ /* * This doesn't move the auxiliary vector itself since it's pinned to * mm_struct, but it permits filling the vector with new values. It's * up to the caller to provide sane values here, otherwise userspace * tools which use this vector might be unhappy.
*/ unsignedlong user_auxv[AT_VECTOR_SIZE] = {};
if (len > sizeof(user_auxv)) return -EINVAL;
if (copy_from_user(user_auxv, (constvoid __user *)addr, len)) return -EFAULT;
/* Make sure the last entry is always AT_NULL */
user_auxv[AT_VECTOR_SIZE - 2] = 0;
user_auxv[AT_VECTOR_SIZE - 1] = 0;
if (opt == PR_SET_MM_EXE_FILE) return prctl_set_mm_exe_file(mm, (unsignedint)addr);
if (opt == PR_SET_MM_AUXV) return prctl_set_auxv(mm, addr, arg4);
if (addr >= TASK_SIZE || addr < mmap_min_addr) return -EINVAL;
error = -EINVAL;
/* * arg_lock protects concurrent updates of arg boundaries, we need * mmap_lock for a) concurrent sys_brk, b) finding VMA for addr * validation.
*/
mmap_read_lock(mm);
vma = find_vma(mm, addr);
switch (opt) { case PR_SET_MM_START_CODE:
prctl_map.start_code = addr; break; case PR_SET_MM_END_CODE:
prctl_map.end_code = addr; break; case PR_SET_MM_START_DATA:
prctl_map.start_data = addr; break; case PR_SET_MM_END_DATA:
prctl_map.end_data = addr; break; case PR_SET_MM_START_STACK:
prctl_map.start_stack = addr; break; case PR_SET_MM_START_BRK:
prctl_map.start_brk = addr; break; case PR_SET_MM_BRK:
prctl_map.brk = addr; break; case PR_SET_MM_ARG_START:
prctl_map.arg_start = addr; break; case PR_SET_MM_ARG_END:
prctl_map.arg_end = addr; break; case PR_SET_MM_ENV_START:
prctl_map.env_start = addr; break; case PR_SET_MM_ENV_END:
prctl_map.env_end = addr; break; default: goto out;
}
error = validate_prctl_map_addr(&prctl_map); if (error) goto out;
switch (opt) { /* * If command line arguments and environment * are placed somewhere else on stack, we can * set them up here, ARG_START/END to setup * command line arguments and ENV_START/END * for environment.
*/ case PR_SET_MM_START_STACK: case PR_SET_MM_ARG_START: case PR_SET_MM_ARG_END: case PR_SET_MM_ENV_START: case PR_SET_MM_ENV_END: if (!vma) {
error = -EFAULT; goto out;
}
}
static int propagate_has_child_subreaper(struct task_struct *p, void *data)
{
	/*
	 * If task has has_child_subreaper - all its descendants
	 * already have these flag too and new descendants will
	 * inherit it on fork, skip them.
	 *
	 * If we've found child_reaper - skip descendants in
	 * it's subtree as they will never get out pidns.
	 */
	if (p->signal->has_child_subreaper ||
	    is_child_reaper(task_pid(p)))
		return 0;

	p->signal->has_child_subreaper = 1;
	return 1;
}
staticinlineunsignedlong get_current_mdwe(void)
{ unsignedlong ret = 0;
if (test_bit(MMF_HAS_MDWE, ¤t->mm->flags))
ret |= PR_MDWE_REFUSE_EXEC_GAIN; if (test_bit(MMF_HAS_MDWE_NO_INHERIT, ¤t->mm->flags))
ret |= PR_MDWE_NO_INHERIT;
if (bits & ~(PR_MDWE_REFUSE_EXEC_GAIN | PR_MDWE_NO_INHERIT)) return -EINVAL;
/* NO_INHERIT only makes sense with REFUSE_EXEC_GAIN */ if (bits & PR_MDWE_NO_INHERIT && !(bits & PR_MDWE_REFUSE_EXEC_GAIN)) return -EINVAL;
/* * EOPNOTSUPP might be more appropriate here in principle, but * existing userspace depends on EINVAL specifically.
*/ if (!arch_memory_deny_write_exec_supported()) return -EINVAL;
current_bits = get_current_mdwe(); if (current_bits && current_bits != bits) return -EPERM; /* Cannot unset the flags */
if (bits & PR_MDWE_NO_INHERIT)
set_bit(MMF_HAS_MDWE_NO_INHERIT, ¤t->mm->flags); if (bits & PR_MDWE_REFUSE_EXEC_GAIN)
set_bit(MMF_HAS_MDWE, ¤t->mm->flags);
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung ist noch experimentell.