/* [Feb 1997 T. Schoebel-Theuer] Complete rewrite of the pathname * lookup logic.
*/ /* [Feb-Apr 2000, AV] Rewrite to the new namespace architecture.
*/
/* [Feb-1997 T. Schoebel-Theuer] * Fundamental changes in the pathname lookup mechanisms (namei) * were necessary because of omirr. The reason is that omirr needs * to know the _real_ pathname, not the user-supplied one, in case * of symlinks (and also when transname replacements occur). * * The new code replaces the old recursive symlink resolution with * an iterative one (in case of non-nested symlink chains). It does * this with calls to <fs>_follow_link(). * As a side effect, dir_namei(), _namei() and follow_link() are now * replaced with a single function lookup_dentry() that can handle all * the special cases of the former code. * * With the new dcache, the pathname is stored at each inode, at least as * long as the refcount of the inode is positive. As a side effect, the * size of the dcache depends on the inode cache and thus is dynamic. * * [29-Apr-1998 C. Scott Ananian] Updated above description of symlink * resolution to correspond with current state of the code. * * Note that the symlink resolution is not *completely* iterative. * There is still a significant amount of tail- and mid- recursion in * the algorithm. Also, note that <fs>_readlink() is not used in * lookup_dentry(): lookup_dentry() on the result of <fs>_readlink() * may return different results than <fs>_follow_link(). Many virtual * filesystems (including /proc) exhibit this behavior.
*/
/* [24-Feb-97 T. Schoebel-Theuer] Side effects caused by new implementation: * New symlink semantics: when open() is called with flags O_CREAT | O_EXCL * and the name already exists in form of a symlink, try to create the new * name indicated by the symlink. The old code always complained that the * name already exists, due to not following the symlink even if its target * is nonexistent. The new semantics affects also mknod() and link() when * the name is a symlink pointing to a non-existent name. * * I don't know which semantics is the right one, since I have no access * to standards. But I found by trial that HP-UX 9.0 has the full "new" * semantics implemented, while SunOS 4.1.1 and Solaris (SunOS 5.4) have the * "old" one. Personally, I think the new semantics is much more logical. * Note that "ln old new" where "new" is a symlink pointing to a non-existing * file does succeed in both HP-UX and SunOs, but not in Solaris * and in the old Linux semantics.
*/
/* [16-Dec-97 Kevin Buhr] For security reasons, we change some symlink * semantics. See the comments in "open_namei" and "do_link" below. * * [10-Sep-98 Alan Modra] Another symlink change.
*/
/* [Feb-Apr 2000 AV] Complete rewrite. Rules for symlinks: * inside the path - always follow. * in the last component in creation/removal/renaming - never follow. * if LOOKUP_FOLLOW passed - follow. * if the pathname has trailing slashes - follow. * otherwise - don't follow. * (applied in that order). * * [Jun 2000 AV] Inconsistent behaviour of open() in case if flags==O_CREAT * restored for 2.4. This is the last surviving part of old 4.2BSD bug. * During the 2.4 we need to fix the userland stuff depending on it - * hopefully we will be able to get rid of that wart in 2.5. So far only * XEmacs seems to be relying on it...
*/ /* * [Sep 2001 AV] Single-semaphore locking scheme (kudos to David Holland) * implemented. Let's see if raised priority of ->s_vfs_rename_mutex gives * any extra contention...
*/
/* In order to reduce some races, while at the same time doing additional * checking and hopefully speeding things up, we copy filenames to the * kernel data space before using them.. * * POSIX.1 2.4: an empty pathname is invalid (ENOENT). * PATH_MAX includes the nul terminator --RR.
*/
struct filename *
getname_flags(constchar __user *filename, int flags)
{ struct filename *result; char *kname; int len;
result = audit_reusename(filename); if (result) return result;
result = __getname(); if (unlikely(!result)) return ERR_PTR(-ENOMEM);
/* * First, try to embed the struct filename inside the names_cache * allocation
*/
kname = (char *)result->iname;
result->name = kname;
len = strncpy_from_user(kname, filename, EMBEDDED_NAME_MAX); /* * Handle both empty path and copy failure in one go.
*/ if (unlikely(len <= 0)) { if (unlikely(len < 0)) {
__putname(result); return ERR_PTR(len);
}
/* The empty path is special. */ if (!(flags & LOOKUP_EMPTY)) {
__putname(result); return ERR_PTR(-ENOENT);
}
}
/* * Uh-oh. We have a name that's approaching PATH_MAX. Allocate a * separate struct filename so we can dedicate the entire * names_cache allocation for the pathname, and re-do the copy from * userland.
*/ if (unlikely(len == EMBEDDED_NAME_MAX)) { const size_t size = offsetof(struct filename, iname[1]);
kname = (char *)result;
/* * size is chosen that way we to guarantee that * result->iname[0] is within the same object and that * kname can't be equal to result->iname, no matter what.
*/
result = kzalloc(size, GFP_KERNEL); if (unlikely(!result)) {
__putname(kname); return ERR_PTR(-ENOMEM);
}
result->name = kname;
len = strncpy_from_user(kname, filename, PATH_MAX); if (unlikely(len < 0)) {
__putname(kname);
kfree(result); return ERR_PTR(len);
} /* The empty path is special. */ if (unlikely(!len) && !(flags & LOOKUP_EMPTY)) {
__putname(kname);
kfree(result); return ERR_PTR(-ENOENT);
} if (unlikely(len == PATH_MAX)) {
__putname(kname);
kfree(result); return ERR_PTR(-ENAMETOOLONG);
}
}
initname(result, filename);
audit_getname(result); return result;
}
struct filename *getname_uflags(constchar __user *filename, int uflags)
{ int flags = (uflags & AT_EMPTY_PATH) ? LOOKUP_EMPTY : 0;
/**
 * check_acl - perform ACL permission checking
 * @idmap: idmap of the mount the inode was found from
 * @inode: inode to check permissions on
 * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC ...)
 *
 * This function performs the ACL permission checking. Since this function
 * retrieve POSIX acls it needs to know whether it is called from a blocking or
 * non-blocking context and thus cares about the MAY_NOT_BLOCK bit.
 *
 * If the inode has been found through an idmapped mount the idmap of
 * the vfsmount must be passed through @idmap. This function will then take
 * care to map the inode according to @idmap before checking permissions.
 * On non-idmapped mounts or if permission checking is to be performed on the
 * raw inode simply pass @nop_mnt_idmap.
 */
static int check_acl(struct mnt_idmap *idmap,
		     struct inode *inode, int mask)
{
#ifdef CONFIG_FS_POSIX_ACL
	struct posix_acl *acl;

	if (mask & MAY_NOT_BLOCK) {
		acl = get_cached_acl_rcu(inode, ACL_TYPE_ACCESS);
		if (!acl)
			return -EAGAIN;
		/* no ->get_inode_acl() calls in RCU mode... */
		if (is_uncached_acl(acl))
			return -ECHILD;
		return posix_acl_permission(idmap, inode, acl, mask);
	}

	acl = get_inode_acl(inode, ACL_TYPE_ACCESS);
	if (IS_ERR(acl))
		return PTR_ERR(acl);
	if (acl) {
		int error = posix_acl_permission(idmap, inode, acl, mask);
		posix_acl_release(acl);
		return error;
	}
#endif

	return -EAGAIN;
}
/* * Very quick optimistic "we know we have no ACL's" check. * * Note that this is purely for ACL_TYPE_ACCESS, and purely * for the "we have cached that there are no ACLs" case. * * If this returns true, we know there are no ACLs. But if * it returns false, we might still not have ACLs (it could * be the is_uncached_acl() case).
*/ staticinlinebool no_acl_inode(struct inode *inode)
{ #ifdef CONFIG_FS_POSIX_ACL return likely(!READ_ONCE(inode->i_acl)); #else returntrue; #endif
}
/** * acl_permission_check - perform basic UNIX permission checking * @idmap: idmap of the mount the inode was found from * @inode: inode to check permissions on * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC ...) * * This function performs the basic UNIX permission checking. Since this * function may retrieve POSIX acls it needs to know whether it is called from a * blocking or non-blocking context and thus cares about the MAY_NOT_BLOCK bit. * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the * raw inode simply pass @nop_mnt_idmap.
*/ staticint acl_permission_check(struct mnt_idmap *idmap, struct inode *inode, int mask)
{ unsignedint mode = inode->i_mode;
vfsuid_t vfsuid;
/* * Common cheap case: everybody has the requested * rights, and there are no ACLs to check. No need * to do any owner/group checks in that case. * * - 'mask&7' is the requested permission bit set * - multiplying by 0111 spreads them out to all of ugo * - '& ~mode' looks for missing inode permission bits * - the '!' is for "no missing permissions" * * After that, we just need to check that there are no * ACL's on the inode - do the 'IS_POSIXACL()' check last * because it will dereference the ->i_sb pointer and we * want to avoid that if at all possible.
*/ if (!((mask & 7) * 0111 & ~mode)) { if (no_acl_inode(inode)) return 0; if (!IS_POSIXACL(inode)) return 0;
}
/* Are we the owner? If so, ACL's don't matter */
vfsuid = i_uid_into_vfsuid(idmap, inode); if (likely(vfsuid_eq_kuid(vfsuid, current_fsuid()))) {
mask &= 7;
mode >>= 6; return (mask & ~mode) ? -EACCES : 0;
}
/* Do we have ACL's? */ if (IS_POSIXACL(inode) && (mode & S_IRWXG)) { int error = check_acl(idmap, inode, mask); if (error != -EAGAIN) return error;
}
/* Only RWX matters for group/other mode bits */
mask &= 7;
/* * Are the group permissions different from * the other permissions in the bits we care * about? Need to check group ownership if so.
*/ if (mask & (mode ^ (mode >> 3))) {
vfsgid_t vfsgid = i_gid_into_vfsgid(idmap, inode); if (vfsgid_in_group_p(vfsgid))
mode >>= 3;
}
/* Bits in 'mode' clear that we require? */ return (mask & ~mode) ? -EACCES : 0;
}
/**
 * generic_permission -  check for access rights on a Posix-like filesystem
 * @idmap:	idmap of the mount the inode was found from
 * @inode:	inode to check access rights for
 * @mask:	right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC,
 *		%MAY_NOT_BLOCK ...)
 *
 * Used to check for read/write/execute permissions on a file.
 * We use "fsuid" for this, letting us set arbitrary permissions
 * for filesystem access without changing the "normal" uids which
 * are used for other things.
 *
 * generic_permission is rcu-walk aware. It returns -ECHILD in case an rcu-walk
 * request cannot be satisfied (eg. requires blocking or too much complexity).
 * It would then be called again in ref-walk mode.
 *
 * If the inode has been found through an idmapped mount the idmap of
 * the vfsmount must be passed through @idmap. This function will then take
 * care to map the inode according to @idmap before checking permissions.
 * On non-idmapped mounts or if permission checking is to be performed on the
 * raw inode simply pass @nop_mnt_idmap.
 */
int generic_permission(struct mnt_idmap *idmap, struct inode *inode,
		       int mask)
{
	int ret;

	/*
	 * Do the basic permission checks.
	 */
	ret = acl_permission_check(idmap, inode, mask);
	if (ret != -EACCES)
		return ret;

	if (S_ISDIR(inode->i_mode)) {
		/* DACs are overridable for directories */
		if (!(mask & MAY_WRITE))
			if (capable_wrt_inode_uidgid(idmap, inode,
						     CAP_DAC_READ_SEARCH))
				return 0;
		if (capable_wrt_inode_uidgid(idmap, inode,
					     CAP_DAC_OVERRIDE))
			return 0;
		return -EACCES;
	}

	/*
	 * Searching includes executable on directories, else just read.
	 */
	mask &= MAY_READ | MAY_WRITE | MAY_EXEC;
	if (mask == MAY_READ)
		if (capable_wrt_inode_uidgid(idmap, inode,
					     CAP_DAC_READ_SEARCH))
			return 0;
	/*
	 * Read/write DACs are always overridable.
	 * Executable DACs are overridable when there is
	 * at least one exec bit set.
	 */
	if (!(mask & MAY_EXEC) || (inode->i_mode & S_IXUGO))
		if (capable_wrt_inode_uidgid(idmap, inode,
					     CAP_DAC_OVERRIDE))
			return 0;

	return ret;
}
EXPORT_SYMBOL(generic_permission);
/**
 * do_inode_permission - UNIX permission checking
 * @idmap: idmap of the mount the inode was found from
 * @inode: inode to check permissions on
 * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC ...)
 *
 * We _really_ want to just do "generic_permission()" without
 * even looking at the inode->i_op values. So we keep a cache
 * flag in inode->i_opflags, that says "this has not special
 * permission function, use the fast case".
 */
static inline int do_inode_permission(struct mnt_idmap *idmap,
				      struct inode *inode, int mask)
{
	if (unlikely(!(inode->i_opflags & IOP_FASTPERM))) {
		if (likely(inode->i_op->permission))
			return inode->i_op->permission(idmap, inode, mask);

		/* This gets set once for the inode lifetime */
		spin_lock(&inode->i_lock);
		inode->i_opflags |= IOP_FASTPERM;
		spin_unlock(&inode->i_lock);
	}
	return generic_permission(idmap, inode, mask);
}
/**
 * sb_permission - Check superblock-level permissions
 * @sb: Superblock of inode to check permission on
 * @inode: Inode to check permission on
 * @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
 *
 * Separate out file-system wide checks from inode-specific permission checks.
 */
static int sb_permission(struct super_block *sb, struct inode *inode, int mask)
{
	if (unlikely(mask & MAY_WRITE)) {
		umode_t mode = inode->i_mode;

		/* Nobody gets write access to a read-only fs. */
		if (sb_rdonly(sb) &&
		    (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
			return -EROFS;
	}
	return 0;
}
/**
 * inode_permission - Check for access rights to a given inode
 * @idmap: idmap of the mount the inode was found from
 * @inode: Inode to check permission on
 * @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
 *
 * Check for read/write/execute permissions on an inode. We use fs[ug]id for
 * this, letting us set arbitrary permissions for filesystem access without
 * changing the "normal" UIDs which are used for other things.
 *
 * When checking for MAY_APPEND, MAY_WRITE must also be set in @mask.
 */
int inode_permission(struct mnt_idmap *idmap,
		     struct inode *inode, int mask)
{
	int retval;

	retval = sb_permission(inode->i_sb, inode, mask);
	if (unlikely(retval))
		return retval;

	if (unlikely(mask & MAY_WRITE)) {
		/*
		 * Nobody gets write access to an immutable file.
		 */
		if (unlikely(IS_IMMUTABLE(inode)))
			return -EPERM;

		/*
		 * Updating mtime will likely cause i_uid and i_gid to be
		 * written back improperly if their true value is unknown
		 * to the vfs.
		 */
		if (unlikely(HAS_UNMAPPED_ID(idmap, inode)))
			return -EACCES;
	}

	retval = do_inode_permission(idmap, inode, mask);
	if (unlikely(retval))
		return retval;

	retval = devcgroup_inode_permission(inode, mask);
	if (unlikely(retval))
		return retval;

	return security_inode_permission(inode, mask);
}
EXPORT_SYMBOL(inode_permission);
/** * path_get - get a reference to a path * @path: path to get the reference to * * Given a path increment the reference count to the dentry and the vfsmount.
*/ void path_get(conststruct path *path)
{
mntget(path->mnt);
dget(path->dentry);
}
EXPORT_SYMBOL(path_get);
/** * path_put - put a reference to a path * @path: path to put the reference to * * Given a path decrement the reference count to the dentry and the vfsmount.
*/ void path_put(conststruct path *path)
{
dput(path->dentry);
mntput(path->mnt);
}
EXPORT_SYMBOL(path_put);
/** * path_connected - Verify that a dentry is below mnt.mnt_root * @mnt: The mountpoint to check. * @dentry: The dentry to check. * * Rename can sometimes move a file or directory outside of a bind * mount, path_connected allows those cases to be detected.
*/ staticbool path_connected(struct vfsmount *mnt, struct dentry *dentry)
{ struct super_block *sb = mnt->mnt_sb;
/* Bind mounts can have disconnected paths */ if (mnt->mnt_root == sb->s_root) returntrue;
return is_subdir(dentry, mnt->mnt_root);
}
/* Run and clear the delayed_call cleanup for every saved symlink on the stack. */
static void drop_links(struct nameidata *nd)
{
	int i = nd->depth;
	while (i--) {
		struct saved *last = nd->stack + i;
		do_delayed_call(&last->done);
		clear_delayed_call(&last->done);
	}
}
staticbool legitimize_links(struct nameidata *nd)
{ int i; if (unlikely(nd->flags & LOOKUP_CACHED)) {
drop_links(nd);
nd->depth = 0; returnfalse;
} for (i = 0; i < nd->depth; i++) { struct saved *last = nd->stack + i; if (unlikely(!legitimize_path(nd, &last->link, last->seq))) {
drop_links(nd);
nd->depth = i + 1; returnfalse;
}
} returntrue;
}
staticbool legitimize_root(struct nameidata *nd)
{ /* Nothing to do if nd->root is zero or is managed by the VFS user. */ if (!nd->root.mnt || (nd->state & ND_ROOT_PRESET)) returntrue;
nd->state |= ND_ROOT_GRABBED; return legitimize_path(nd, &nd->root, nd->root_seq);
}
/* * Path walking has 2 modes, rcu-walk and ref-walk (see * Documentation/filesystems/path-lookup.txt). In situations when we can't * continue in RCU mode, we attempt to drop out of rcu-walk mode and grab * normal reference counts on dentries and vfsmounts to transition to ref-walk * mode. Refcounts are grabbed at the last known good point before rcu-walk * got stuck, so ref-walk may continue from there. If this is not successful * (eg. a seqcount has changed), then failure is returned and it's up to caller * to restart the path walk from the beginning in ref-walk mode.
*/
/** * try_to_unlazy - try to switch to ref-walk mode. * @nd: nameidata pathwalk data * Returns: true on success, false on failure * * try_to_unlazy attempts to legitimize the current nd->path and nd->root * for ref-walk mode. * Must be called from rcu-walk context. * Nothing should touch nameidata between try_to_unlazy() failure and * terminate_walk().
*/ staticbool try_to_unlazy(struct nameidata *nd)
{ struct dentry *parent = nd->path.dentry;
BUG_ON(!(nd->flags & LOOKUP_RCU));
if (unlikely(!legitimize_links(nd))) goto out1; if (unlikely(!legitimize_path(nd, &nd->path, nd->seq))) goto out; if (unlikely(!legitimize_root(nd))) goto out;
leave_rcu(nd);
BUG_ON(nd->inode != parent->d_inode); returntrue;
/** * try_to_unlazy_next - try to switch to ref-walk mode. * @nd: nameidata pathwalk data * @dentry: next dentry to step into * Returns: true on success, false on failure * * Similar to try_to_unlazy(), but here we have the next dentry already * picked by rcu-walk and want to legitimize that in addition to the current * nd->path and nd->root for ref-walk mode. Must be called from rcu-walk context. * Nothing should touch nameidata between try_to_unlazy_next() failure and * terminate_walk().
*/ staticbool try_to_unlazy_next(struct nameidata *nd, struct dentry *dentry)
{ int res;
BUG_ON(!(nd->flags & LOOKUP_RCU));
if (unlikely(!legitimize_links(nd))) goto out2;
res = __legitimize_mnt(nd->path.mnt, nd->m_seq); if (unlikely(res)) { if (res > 0) goto out2; goto out1;
} if (unlikely(!lockref_get_not_dead(&nd->path.dentry->d_lockref))) goto out1;
/* * We need to move both the parent and the dentry from the RCU domain * to be properly refcounted. And the sequence number in the dentry * validates *both* dentry counters, since we checked the sequence * number of the parent after we got the child sequence number. So we * know the parent must still be valid if the child sequence number is
*/ if (unlikely(!lockref_get_not_dead(&dentry->d_lockref))) goto out; if (read_seqcount_retry(&dentry->d_seq, nd->next_seq)) goto out_dput; /* * Sequence counts matched. Now make sure that the root is * still valid and get it if required.
*/ if (unlikely(!legitimize_root(nd))) goto out_dput;
leave_rcu(nd); returntrue;
/** * complete_walk - successful completion of path walk * @nd: pointer nameidata * * If we had been in RCU mode, drop out of it and legitimize nd->path. * Revalidate the final result, unless we'd already done that during * the path walk or the filesystem doesn't ask for it. Return 0 on * success, -error on failure. In case of failure caller does not * need to drop nd->path.
*/ staticint complete_walk(struct nameidata *nd)
{ struct dentry *dentry = nd->path.dentry; int status;
if (nd->flags & LOOKUP_RCU) { /* * We don't want to zero nd->root for scoped-lookups or * externally-managed nd->root.
*/ if (!(nd->state & ND_ROOT_PRESET)) if (!(nd->flags & LOOKUP_IS_SCOPED))
nd->root.mnt = NULL;
nd->flags &= ~LOOKUP_CACHED; if (!try_to_unlazy(nd)) return -ECHILD;
}
if (unlikely(nd->flags & LOOKUP_IS_SCOPED)) { /* * While the guarantee of LOOKUP_IS_SCOPED is (roughly) "don't * ever step outside the root during lookup" and should already * be guaranteed by the rest of namei, we want to avoid a namei * BUG resulting in userspace being given a path that was not * scoped within the root at some point during the lookup. * * So, do a final sanity-check to make sure that in the * worst-case scenario (a complete bypass of LOOKUP_IS_SCOPED) * we won't silently return an fd completely outside of the * requested root to userspace. * * Userspace could move the path outside the root after this * check, but as discussed elsewhere this is not a concern (the * resolved file was inside the root at some point).
*/ if (!path_is_under(&nd->path, &nd->root)) return -EXDEV;
}
if (likely(!(nd->state & ND_JUMPED))) return 0;
if (likely(!(dentry->d_flags & DCACHE_OP_WEAK_REVALIDATE))) return 0;
status = dentry->d_op->d_weak_revalidate(dentry, nd->flags); if (status > 0) return 0;
/* * Jumping to the real root in a scoped-lookup is a BUG in namei, but we * still have to ensure it doesn't happen because it will cause a breakout * from the dirfd.
*/ if (WARN_ON(nd->flags & LOOKUP_IS_SCOPED)) return -ENOTRECOVERABLE;
staticint nd_jump_root(struct nameidata *nd)
{ if (unlikely(nd->flags & LOOKUP_BENEATH)) return -EXDEV; if (unlikely(nd->flags & LOOKUP_NO_XDEV)) { /* Absolute path arguments to path_init() are allowed. */ if (nd->path.mnt != NULL && nd->path.mnt != nd->root.mnt) return -EXDEV;
} if (!nd->root.mnt) { int error = set_root(nd); if (error) return error;
} if (nd->flags & LOOKUP_RCU) { struct dentry *d;
nd->path = nd->root;
d = nd->path.dentry;
nd->inode = d->d_inode;
nd->seq = nd->root_seq; if (read_seqcount_retry(&d->d_seq, nd->seq)) return -ECHILD;
} else {
path_put(&nd->path);
nd->path = nd->root;
path_get(&nd->path);
nd->inode = nd->path.dentry->d_inode;
}
nd->state |= ND_JUMPED; return 0;
}
/* * Helper to directly jump to a known parsed path from ->get_link, * caller must have taken a reference to path beforehand.
*/ int nd_jump_link(conststruct path *path)
{ int error = -ELOOP; struct nameidata *nd = current->nameidata;
if (unlikely(nd->flags & LOOKUP_NO_MAGICLINKS)) goto err;
error = -EXDEV; if (unlikely(nd->flags & LOOKUP_NO_XDEV)) { if (nd->path.mnt != path->mnt) goto err;
} /* Not currently safe for scoped-lookups. */ if (unlikely(nd->flags & LOOKUP_IS_SCOPED)) goto err;
/** * may_follow_link - Check symlink following for unsafe situations * @nd: nameidata pathwalk data * @inode: Used for idmapping. * * In the case of the sysctl_protected_symlinks sysctl being enabled, * CAP_DAC_OVERRIDE needs to be specifically ignored if the symlink is * in a sticky world-writable directory. This is to protect privileged * processes from failing races against path names that may change out * from under them by way of other users creating malicious symlinks. * It will permit symlinks to be followed only when outside a sticky * world-writable directory, or when the uid of the symlink and follower * match, or when the directory owner matches the symlink's owner. * * Returns 0 if following the symlink is allowed, -ve on error.
*/ staticinlineint may_follow_link(struct nameidata *nd, conststruct inode *inode)
{ struct mnt_idmap *idmap;
vfsuid_t vfsuid;
if (!sysctl_protected_symlinks) return 0;
idmap = mnt_idmap(nd->path.mnt);
vfsuid = i_uid_into_vfsuid(idmap, inode); /* Allowed if owner and follower match. */ if (vfsuid_eq_kuid(vfsuid, current_fsuid())) return 0;
/* Allowed if parent directory not sticky and world-writable. */ if ((nd->dir_mode & (S_ISVTX|S_IWOTH)) != (S_ISVTX|S_IWOTH)) return 0;
/* Allowed if parent directory and link owner match. */ if (vfsuid_valid(nd->dir_vfsuid) && vfsuid_eq(nd->dir_vfsuid, vfsuid)) return 0;
/** * safe_hardlink_source - Check for safe hardlink conditions * @idmap: idmap of the mount the inode was found from * @inode: the source inode to hardlink from * * Return false if at least one of the following conditions: * - inode is not a regular file * - inode is setuid * - inode is setgid and group-exec * - access failure for read and write * * Otherwise returns true.
*/ staticbool safe_hardlink_source(struct mnt_idmap *idmap, struct inode *inode)
{
umode_t mode = inode->i_mode;
/* Special files should not get pinned to the filesystem. */ if (!S_ISREG(mode)) returnfalse;
/* Setuid files should not get pinned to the filesystem. */ if (mode & S_ISUID) returnfalse;
/* Executable setgid files should not get pinned to the filesystem. */ if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) returnfalse;
/* Hardlinking to unreadable or unwritable sources is dangerous. */ if (inode_permission(idmap, inode, MAY_READ | MAY_WRITE)) returnfalse;
returntrue;
}
/** * may_linkat - Check permissions for creating a hardlink * @idmap: idmap of the mount the inode was found from * @link: the source to hardlink from * * Block hardlink when all of: * - sysctl_protected_hardlinks enabled * - fsuid does not match inode * - hardlink source is unsafe (see safe_hardlink_source() above) * - not CAP_FOWNER in a namespace with the inode owner uid mapped * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the * raw inode simply pass @nop_mnt_idmap. * * Returns 0 if successful, -ve on error.
*/ int may_linkat(struct mnt_idmap *idmap, conststruct path *link)
{ struct inode *inode = link->dentry->d_inode;
/* Inode writeback is not safe when the uid or gid are invalid. */ if (!vfsuid_valid(i_uid_into_vfsuid(idmap, inode)) ||
!vfsgid_valid(i_gid_into_vfsgid(idmap, inode))) return -EOVERFLOW;
if (!sysctl_protected_hardlinks) return 0;
/* Source inode owner (or CAP_FOWNER) can hardlink all they like, * otherwise, it must be a safe source.
*/ if (safe_hardlink_source(idmap, inode) ||
inode_owner_or_capable(idmap, inode)) return 0;
/**
 * may_create_in_sticky - Check whether an O_CREAT open in a sticky directory
 *			  should be allowed, or not, on files that already
 *			  exist.
 * @idmap: idmap of the mount the inode was found from
 * @nd: nameidata pathwalk data
 * @inode: the inode of the file to open
 *
 * Block an O_CREAT open of a FIFO (or a regular file) when:
 *   - sysctl_protected_fifos (or sysctl_protected_regular) is enabled
 *   - the file already exists
 *   - we are in a sticky directory
 *   - we don't own the file
 *   - the owner of the directory doesn't own the file
 *   - the directory is world writable
 * If the sysctl_protected_fifos (or sysctl_protected_regular) is set to 2
 * the directory doesn't have to be world writable: being group writable will
 * be enough.
 *
 * If the inode has been found through an idmapped mount the idmap of
 * the vfsmount must be passed through @idmap. This function will then take
 * care to map the inode according to @idmap before checking permissions.
 * On non-idmapped mounts or if permission checking is to be performed on the
 * raw inode simply pass @nop_mnt_idmap.
 *
 * Returns 0 if the open is allowed, -ve on error.
 */
static int may_create_in_sticky(struct mnt_idmap *idmap, struct nameidata *nd,
				struct inode *const inode)
{
	umode_t dir_mode = nd->dir_mode;
	vfsuid_t dir_vfsuid = nd->dir_vfsuid, i_vfsuid;

	if (likely(!(dir_mode & S_ISVTX)))
		return 0;

	if (S_ISREG(inode->i_mode) && !sysctl_protected_regular)
		return 0;

	if (S_ISFIFO(inode->i_mode) && !sysctl_protected_fifos)
		return 0;

	i_vfsuid = i_uid_into_vfsuid(idmap, inode);

	if (vfsuid_eq(i_vfsuid, dir_vfsuid))
		return 0;

	if (vfsuid_eq_kuid(i_vfsuid, current_fsuid()))
		return 0;

	if (likely(dir_mode & 0002)) {
		audit_log_path_denied(AUDIT_ANOM_CREAT, "sticky_create");
		return -EACCES;
	}

	if (dir_mode & 0020) {
		if (sysctl_protected_fifos >= 2 && S_ISFIFO(inode->i_mode)) {
			audit_log_path_denied(AUDIT_ANOM_CREAT,
					      "sticky_create_fifo");
			return -EACCES;
		}

		if (sysctl_protected_regular >= 2 && S_ISREG(inode->i_mode)) {
			audit_log_path_denied(AUDIT_ANOM_CREAT,
					      "sticky_create_regular");
			return -EACCES;
		}
	}

	return 0;
}
/* * follow_up - Find the mountpoint of path's vfsmount * * Given a path, find the mountpoint of its source file system. * Replace @path with the path of the mountpoint in the parent mount. * Up is towards /. * * Return 1 if we went up a level and 0 if we were already at the * root.
*/ int follow_up(struct path *path)
{ struct mount *mnt = real_mount(path->mnt); struct mount *parent; struct dentry *mountpoint;
rcu_read_lock(); while (1) { unsigned seq, mseq = read_seqbegin(&mount_lock);
found = choose_mountpoint_rcu(m, root, path, &seq); if (unlikely(!found)) { if (!read_seqretry(&mount_lock, mseq)) break;
} else { if (likely(__legitimize_path(path, seq, mseq))) break;
rcu_read_unlock();
path_put(path);
rcu_read_lock();
}
}
rcu_read_unlock(); return found;
}
/*
 * Perform an automount
 * - return -EISDIR to tell follow_managed() to stop and return the path we
 *   were called with.
 */
static int follow_automount(struct path *path, int *count, unsigned lookup_flags)
{
	struct dentry *dentry = path->dentry;

	/* We don't want to mount if someone's just doing a stat -
	 * unless they're stat'ing a directory and appended a '/' to
	 * the name.
	 *
	 * We do, however, want to mount if someone wants to open or
	 * create a file of any type under the mountpoint, wants to
	 * traverse through the mountpoint or wants to open the
	 * mounted directory.  Also, autofs may mark negative dentries
	 * as being automount points.  These will need the attentions
	 * of the daemon to instantiate them before they can be used.
	 */
	if (!(lookup_flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY |
			   LOOKUP_OPEN | LOOKUP_CREATE | LOOKUP_AUTOMOUNT)) &&
	    dentry->d_inode)
		return -EISDIR;

	/* No need to trigger automounts if mountpoint crossing is disabled. */
	if (lookup_flags & LOOKUP_NO_XDEV)
		return -EXDEV;

	if (count && (*count)++ >= MAXSYMLINKS)
		return -ELOOP;

	return finish_automount(dentry->d_op->d_automount(path), path);
}
/*
 * mount traversal - out-of-line part.  One note on ->d_flags accesses -
 * dentries are pinned but not locked here, so negative dentry can go
 * positive right under us.  Use of smp_load_acquire() provides a barrier
 * sufficient for ->d_inode and ->d_flags consistency.
 */
static int __traverse_mounts(struct path *path, unsigned flags, bool *jumped,
			     int *count, unsigned lookup_flags)
{
	struct vfsmount *mnt = path->mnt;
	bool need_mntput = false;
	int ret = 0;

	while (flags & DCACHE_MANAGED_DENTRY) {
		/* Allow the filesystem to manage the transit without i_rwsem
		 * being held. */
		if (flags & DCACHE_MANAGE_TRANSIT) {
			if (lookup_flags & LOOKUP_NO_XDEV) {
				ret = -EXDEV;
				break;
			}
			ret = path->dentry->d_op->d_manage(path, false);
			flags = smp_load_acquire(&path->dentry->d_flags);
			if (ret < 0)
				break;
		}

		if (flags & DCACHE_MOUNTED) {	// something's mounted on it..
			struct vfsmount *mounted = lookup_mnt(path);
			if (mounted) {		// ... in our namespace
				dput(path->dentry);
				if (need_mntput)
					mntput(path->mnt);
				path->mnt = mounted;
				path->dentry = dget(mounted->mnt_root);
				// here we know it's positive
				flags = path->dentry->d_flags;
				need_mntput = true;
				continue;
			}
		}

		if (!(flags & DCACHE_NEED_AUTOMOUNT))
			break;

		// uncovered automount point
		ret = follow_automount(path, count, lookup_flags);
		flags = smp_load_acquire(&path->dentry->d_flags);
		if (ret < 0)
			break;
	}

	if (ret == -EISDIR)
		ret = 0;
	// possible if you race with several mount --move
	if (need_mntput && path->mnt == mnt)
		mntput(path->mnt);
	if (!ret && unlikely(d_flags_negative(flags)))
		ret = -ENOENT;
	*jumped = need_mntput;
	return ret;
}

/*
 * Inline fast path for the common unmanaged-dentry case; falls back to
 * __traverse_mounts() for managed dentries.  (Restored: called by
 * follow_down() below but missing from the mangled source.)
 */
static inline int traverse_mounts(struct path *path, bool *jumped,
				  int *count, unsigned lookup_flags)
{
	unsigned flags = smp_load_acquire(&path->dentry->d_flags);

	/* fastpath */
	if (likely(!(flags & DCACHE_MANAGED_DENTRY))) {
		*jumped = false;
		if (unlikely(d_flags_negative(flags)))
			return -ENOENT;
		return 0;
	}
	return __traverse_mounts(path, flags, jumped, count, lookup_flags);
}
/* * Follow down to the covering mount currently visible to userspace. At each * point, the filesystem owning that dentry may be queried as to whether the * caller is permitted to proceed or not.
*/ int follow_down(struct path *path, unsignedint flags)
{ struct vfsmount *mnt = path->mnt; bool jumped; int ret = traverse_mounts(path, &jumped, NULL, flags);
if (path->mnt != mnt)
mntput(mnt); return ret;
}
EXPORT_SYMBOL(follow_down);
/*
 * Try to skip to top of mountpoint pile in rcuwalk mode.  Fail if
 * we meet a managed dentry that would need blocking.
 *
 * Returns true if *path now refers to the topmost mount (or needed no
 * traversal at all), false if the caller must fall back to non-RCU
 * walk (or fail, for LOOKUP_NO_XDEV).  On each mount crossing,
 * nd->next_seq is resampled from the new root dentry, and mount_lock
 * is rechecked so that a non-RCU walk could legitimately have reached
 * the same state.
 */
static bool __follow_mount_rcu(struct nameidata *nd, struct path *path)
{
	struct dentry *dentry = path->dentry;
	unsigned int flags = dentry->d_flags;

	/* common case: nothing managed/mounted here */
	if (likely(!(flags & DCACHE_MANAGED_DENTRY)))
		return true;

	/* crossing a mount is forbidden for this walk */
	if (unlikely(nd->flags & LOOKUP_NO_XDEV))
		return false;

	for (;;) {
		/*
		 * Don't forget we might have a non-mountpoint managed dentry
		 * that wants to block transit.
		 */
		if (unlikely(flags & DCACHE_MANAGE_TRANSIT)) {
			/* rcu==true: d_manage must not block; -EISDIR means
			 * "stop here, treat as plain directory" */
			int res = dentry->d_op->d_manage(path, true);
			if (res)
				return res == -EISDIR;
			flags = dentry->d_flags;
		}

		if (flags & DCACHE_MOUNTED) {
			struct mount *mounted = __lookup_mnt(path->mnt, dentry);
			if (mounted) {
				path->mnt = &mounted->mnt;
				dentry = path->dentry = mounted->mnt.mnt_root;
				nd->state |= ND_JUMPED;
				nd->next_seq = read_seqcount_begin(&dentry->d_seq);
				flags = dentry->d_flags;
				// makes sure that non-RCU pathwalk could reach
				// this state.
				if (read_seqretry(&mount_lock, nd->m_seq))
					return false;
				continue;
			}
			if (read_seqretry(&mount_lock, nd->m_seq))
				return false;
		}
		/* stop unless an automount still needs triggering (which
		 * cannot be done in RCU mode) */
		return !(flags & DCACHE_NEED_AUTOMOUNT);
	}
}
path->mnt = nd->path.mnt;
path->dentry = dentry; if (nd->flags & LOOKUP_RCU) { unsignedint seq = nd->next_seq; if (likely(__follow_mount_rcu(nd, path))) return 0; // *path and nd->next_seq might've been clobbered
path->mnt = nd->path.mnt;
path->dentry = dentry;
nd->next_seq = seq; if (!try_to_unlazy_next(nd, dentry)) return -ECHILD;
}
ret = traverse_mounts(path, &jumped, &nd->total_link_count, nd->flags); if (jumped) { if (unlikely(nd->flags & LOOKUP_NO_XDEV))
ret = -EXDEV; else
nd->state |= ND_JUMPED;
} if (unlikely(ret)) {
dput(path->dentry); if (path->mnt != nd->path.mnt)
mntput(path->mnt);
} return ret;
}
/* * This looks up the name in dcache and possibly revalidates the found dentry. * NULL is returned if the dentry does not exist in the cache.
*/ staticstruct dentry *lookup_dcache(conststruct qstr *name, struct dentry *dir, unsignedint flags)
{ struct dentry *dentry = d_lookup(dir, name); if (dentry) { int error = d_revalidate(dir->d_inode, name, dentry, flags); if (unlikely(error <= 0)) { if (!error)
d_invalidate(dentry);
dput(dentry); return ERR_PTR(error);
}
} return dentry;
}
/*
 * Parent directory has inode locked exclusive.  This is one
 * and only case when ->lookup() gets called on non in-lookup
 * dentries - as the matter of fact, this only gets called
 * when directory is guaranteed to have no in-lookup children
 * at all.
 * Will return -ENOENT if name isn't found and LOOKUP_CREATE wasn't passed.
 * Will return -EEXIST if name is found and LOOKUP_EXCL was passed.
 *
 * On success returns a referenced dentry (positive or negative,
 * subject to the LOOKUP_CREATE/LOOKUP_EXCL checks above).
 */
struct dentry *lookup_one_qstr_excl(const struct qstr *name,
				    struct dentry *base, unsigned int flags)
{
	struct dentry *dentry;
	struct dentry *old;
	struct inode *dir;

	/* dcache first - may already have a (revalidated) answer */
	dentry = lookup_dcache(name, base, flags);
	if (dentry)
		goto found;

	/* Don't create child dentry for a dead directory. */
	dir = base->d_inode;
	if (unlikely(IS_DEADDIR(dir)))
		return ERR_PTR(-ENOENT);

	dentry = d_alloc(base, name);
	if (unlikely(!dentry))
		return ERR_PTR(-ENOMEM);

	/* ->lookup() may return a different dentry to use instead */
	old = dir->i_op->lookup(dir, dentry, flags);
	if (unlikely(old)) {
		dput(dentry);
		dentry = old;
	}
found:
	if (IS_ERR(dentry))
		return dentry;
	if (d_is_negative(dentry) && !(flags & LOOKUP_CREATE)) {
		dput(dentry);
		return ERR_PTR(-ENOENT);
	}
	if (d_is_positive(dentry) && (flags & LOOKUP_EXCL)) {
		dput(dentry);
		return ERR_PTR(-EEXIST);
	}
	return dentry;
}
EXPORT_SYMBOL(lookup_one_qstr_excl);
/**
 * lookup_fast - do fast lockless (but racy) lookup of a dentry
 * @nd: current nameidata
 *
 * Do a fast, but racy lookup in the dcache for the given dentry, and
 * revalidate it. Returns a valid dentry pointer or NULL if one wasn't
 * found. On error, an ERR_PTR will be returned.
 *
 * If this function returns a valid dentry and the walk is no longer
 * lazy, the dentry will carry a reference that must later be put. If
 * RCU mode is still in force, then this is not the case and the dentry
 * must be legitimized before use. If this returns NULL, then the walk
 * will no longer be in RCU mode.
 */
static struct dentry *lookup_fast(struct nameidata *nd)
{
	struct dentry *dentry, *parent = nd->path.dentry;
	int status = 1;

	/*
	 * Rename seqlock is not required here because in the off chance
	 * of a false negative due to a concurrent rename, the caller is
	 * going to fall back to non-racy lookup.
	 */
	if (nd->flags & LOOKUP_RCU) {
		dentry = __d_lookup_rcu(parent, &nd->last, &nd->next_seq);
		if (unlikely(!dentry)) {
			/* cache miss: drop out of RCU mode for the caller */
			if (!try_to_unlazy(nd))
				return ERR_PTR(-ECHILD);
			return NULL;
		}

		/*
		 * This sequence count validates that the parent had no
		 * changes while we did the lookup of the dentry above.
		 */
		if (read_seqcount_retry(&parent->d_seq, nd->seq))
			return ERR_PTR(-ECHILD);

		status = d_revalidate(nd->inode, &nd->last, dentry, nd->flags);
		if (likely(status > 0))
			return dentry;
		/* revalidation needs to block - legitimize and retry below */
		if (!try_to_unlazy_next(nd, dentry))
			return ERR_PTR(-ECHILD);
		if (status == -ECHILD)
			/* we'd been told to redo it in non-rcu mode */
			status = d_revalidate(nd->inode, &nd->last,
					      dentry, nd->flags);
	} else {
		dentry = __d_lookup(parent, &nd->last);
		if (unlikely(!dentry))
			return NULL;
		status = d_revalidate(nd->inode, &nd->last, dentry, nd->flags);
	}
	if (unlikely(status <= 0)) {
		/* status == 0: stale dentry; status < 0: hard error */
		if (!status)
			d_invalidate(dentry);
		dput(dentry);
		return ERR_PTR(status);
	}
	return dentry;
}
/*
 * Fast lookup failed, do it the slow way: allocate (or join) an
 * in-lookup dentry and call the filesystem's ->lookup().  If we raced
 * with another lookup and got an already-instantiated dentry that then
 * fails revalidation, invalidate it and retry from scratch.
 */
static struct dentry *__lookup_slow(const struct qstr *name,
				    struct dentry *dir,
				    unsigned int flags)
{
	struct dentry *dentry, *old;
	struct inode *inode = dir->d_inode;
	DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);

	/* Don't go there if it's already dead */
	if (unlikely(IS_DEADDIR(inode)))
		return ERR_PTR(-ENOENT);
again:
	dentry = d_alloc_parallel(dir, name, &wq);
	if (IS_ERR(dentry))
		return dentry;
	if (unlikely(!d_in_lookup(dentry))) {
		/* somebody else finished the lookup first - revalidate theirs */
		int error = d_revalidate(inode, name, dentry, flags);
		if (unlikely(error <= 0)) {
			if (!error) {
				d_invalidate(dentry);
				dput(dentry);
				goto again;
			}
			dput(dentry);
			dentry = ERR_PTR(error);
		}
	} else {
		/* we own the in-lookup dentry - ask the filesystem */
		old = inode->i_op->lookup(inode, dentry, flags);
		d_lookup_done(dentry);
		if (unlikely(old)) {
			/* ->lookup() substituted its own dentry */
			dput(dentry);
			dentry = old;
		}
	}
	return dentry;
}
if (likely(nd->depth != EMBEDDED_LEVELS)) return 0; if (likely(nd->stack != nd->internal)) return 0; if (likely(nd_alloc_stack(nd))) return 0;
if (nd->flags & LOOKUP_RCU) { // we need to grab link before we do unlazy. And we can't skip // unlazy even if we fail to grab the link - cleanup needs it bool grabbed_link = legitimize_path(nd, link, nd->next_seq);
if (!try_to_unlazy(nd) || !grabbed_link) return -ECHILD;
if (nd_alloc_stack(nd)) return 0;
} return -ENOMEM;
}
res = READ_ONCE(inode->i_link); if (!res) { constchar * (*get)(struct dentry *, struct inode *, struct delayed_call *);
get = inode->i_op->get_link; if (nd->flags & LOOKUP_RCU) {
res = get(NULL, inode, &last->done); if (res == ERR_PTR(-ECHILD) && try_to_unlazy(nd))
res = get(link->dentry, inode, &last->done);
} else {
res = get(link->dentry, inode, &last->done);
} if (!res) goto all_done; if (IS_ERR(res)) return res;
} if (*res == '/') {
error = nd_jump_root(nd); if (unlikely(error)) return ERR_PTR(error); while (unlikely(*++res == '/'))
;
} if (*res) return res;
all_done: // pure jump
put_link(nd); return NULL;
}
/*
 * Do we need to follow links? We _really_ want to be able
 * to do this check without having to look at inode->i_op,
 * so we keep a cache of "no, this doesn't need follow_link"
 * for the common case.
 *
 * NOTE: dentry must be what nd->next_seq had been sampled from.
 *
 * Moves nd to the dentry (crossing mounts via handle_mounts()).
 * Returns NULL when we simply stepped into a non-symlink (or a
 * symlink we were told not to follow), the link body via pick_link()
 * when a symlink must be followed, or an ERR_PTR on failure.
 */
static const char *step_into(struct nameidata *nd, int flags,
		     struct dentry *dentry)
{
	struct path path;
	struct inode *inode;
	int err = handle_mounts(nd, dentry, &path);

	if (err < 0)
		return ERR_PTR(err);
	inode = path.dentry->d_inode;
	if (likely(!d_is_symlink(path.dentry)) ||
	   ((flags & WALK_TRAILING) && !(nd->flags & LOOKUP_FOLLOW)) ||
	   (flags & WALK_NOFOLLOW)) {
		/* not a symlink or should not follow */
		if (nd->flags & LOOKUP_RCU) {
			/* validate the ->d_inode read above against d_seq */
			if (read_seqcount_retry(&path.dentry->d_seq, nd->next_seq))
				return ERR_PTR(-ECHILD);
			if (unlikely(!inode))
				return ERR_PTR(-ENOENT);
		} else {
			/* drop references to the spot we're leaving */
			dput(nd->path.dentry);
			if (nd->path.mnt != path.mnt)
				mntput(nd->path.mnt);
		}
		nd->path = path;
		nd->inode = inode;
		nd->seq = nd->next_seq;
		return NULL;
	}
	if (nd->flags & LOOKUP_RCU) {
		/* make sure that d_is_symlink above matches inode */
		if (read_seqcount_retry(&path.dentry->d_seq, nd->next_seq))
			return ERR_PTR(-ECHILD);
	} else {
		/* handle_mounts() only grabbed a new mnt ref on crossing */
		if (path.mnt == nd->path.mnt)
			mntget(path.mnt);
	}
	return pick_link(nd, &path, inode, flags);
}
if (path_equal(&nd->path, &nd->root)) goto in_root; if (unlikely(nd->path.dentry == nd->path.mnt->mnt_root)) { struct path path; unsigned seq; if (!choose_mountpoint_rcu(real_mount(nd->path.mnt),
&nd->root, &path, &seq)) goto in_root; if (unlikely(nd->flags & LOOKUP_NO_XDEV)) return ERR_PTR(-ECHILD);
nd->path = path;
nd->inode = path.dentry->d_inode;
nd->seq = seq; // makes sure that non-RCU pathwalk could reach this state if (read_seqretry(&mount_lock, nd->m_seq)) return ERR_PTR(-ECHILD); /* we know that mountpoint was pinned */
}
old = nd->path.dentry;
parent = old->d_parent;
nd->next_seq = read_seqcount_begin(&parent->d_seq); // makes sure that non-RCU pathwalk could reach this state if (read_seqcount_retry(&old->d_seq, nd->seq)) return ERR_PTR(-ECHILD); if (unlikely(!path_connected(nd->path.mnt, parent))) return ERR_PTR(-ECHILD); return parent;
in_root: if (read_seqretry(&mount_lock, nd->m_seq)) return ERR_PTR(-ECHILD); if (unlikely(nd->flags & LOOKUP_BENEATH)) return ERR_PTR(-ECHILD);
nd->next_seq = nd->seq; return nd->path.dentry;
}
if (path_equal(&nd->path, &nd->root)) goto in_root; if (unlikely(nd->path.dentry == nd->path.mnt->mnt_root)) { struct path path;
if (!choose_mountpoint(real_mount(nd->path.mnt),
&nd->root, &path)) goto in_root;
path_put(&nd->path);
nd->path = path;
nd->inode = path.dentry->d_inode; if (unlikely(nd->flags & LOOKUP_NO_XDEV)) return ERR_PTR(-EXDEV);
} /* rare case of legitimate dget_parent()... */
parent = dget_parent(nd->path.dentry); if (unlikely(!path_connected(nd->path.mnt, parent))) {
dput(parent); return ERR_PTR(-ENOENT);
} return parent;
in_root: if (unlikely(nd->flags & LOOKUP_BENEATH)) return ERR_PTR(-EXDEV); return dget(nd->path.dentry);
}
/*
 * Handle the "." and ".." path components.  "." is a no-op; for ".."
 * we walk to the parent (possibly crossing mount boundaries) via
 * follow_dotdot{,_rcu}() and step onto it.  Returns NULL on success
 * or an ERR_PTR-encoded error.
 */
static const char *handle_dots(struct nameidata *nd, int type)
{
	if (type == LAST_DOTDOT) {
		const char *error = NULL;
		struct dentry *parent;

		/* ".." is relative to the root - make sure we have one */
		if (!nd->root.mnt) {
			error = ERR_PTR(set_root(nd));
			if (error)
				return error;
		}
		if (nd->flags & LOOKUP_RCU)
			parent = follow_dotdot_rcu(nd);
		else
			parent = follow_dotdot(nd);
		if (IS_ERR(parent))
			return ERR_CAST(parent);
		/* ".." is never a symlink - no following here */
		error = step_into(nd, WALK_NOFOLLOW, parent);
		if (unlikely(error))
			return error;

		if (unlikely(nd->flags & LOOKUP_IS_SCOPED)) {
			/*
			 * If there was a racing rename or mount along our
			 * path, then we can't be sure that ".." hasn't jumped
			 * above nd->root (and so userspace should retry or use
			 * some fallback).
			 */
			smp_rmb();
			if (__read_seqcount_retry(&mount_lock.seqcount, nd->m_seq))
				return ERR_PTR(-EAGAIN);
			if (__read_seqcount_retry(&rename_lock.seqcount, nd->r_seq))
				return ERR_PTR(-EAGAIN);
		}
	}
	return NULL;
}
/*
 * Walk one component of the path: dispatch "." / ".." to handle_dots(),
 * otherwise look the name up (fast path first, then the slow one) and
 * step onto the result.  Unless WALK_MORE is set, any pending symlink
 * body is released before moving on.
 */
static const char *walk_component(struct nameidata *nd, int flags)
{
	struct dentry *found;

	/*
	 * "." and ".." are special - ".." especially so because it has
	 * to be able to know about the current root directory and
	 * parent relationships.
	 */
	if (unlikely(nd->last_type != LAST_NORM)) {
		if (!(flags & WALK_MORE) && nd->depth)
			put_link(nd);
		return handle_dots(nd, nd->last_type);
	}

	found = lookup_fast(nd);
	if (unlikely(!found) && !IS_ERR(found))
		found = lookup_slow(&nd->last, nd->path.dentry, nd->flags);
	if (IS_ERR(found))
		return ERR_CAST(found);

	if (!(flags & WALK_MORE) && nd->depth)
		put_link(nd);
	return step_into(nd, flags, found);
}
/* * We can do the critical dentry name comparison and hashing * operations one word at a time, but we are limited to: * * - Architectures with fast unaligned word accesses. We could * do a "get_unaligned()" if this helps and is sufficiently * fast. * * - non-CONFIG_DEBUG_PAGEALLOC configurations (so that we * do not trap on the (extremely unlikely) case of a page * crossing operation. * * - Furthermore, we need an efficient 64-bit compile for the * 64-bit case in order to generate the "number of bytes in * the final mask". Again, that could be replaced with a * efficient population count instruction or similar.
*/ #ifdef CONFIG_DCACHE_WORD_ACCESS
#include <asm/word-at-a-time.h>
#ifdef HASH_MIX
/* Architecture provides HASH_MIX and fold_hash() in <asm/hash.h> */
#elif defined(CONFIG_64BIT)
/*
 * Register pressure in the mixing function is an issue, particularly
 * on 32-bit x86, but almost any function requires one state value and
 * one temporary.  Instead, use a function designed for two state values
 * and no temporaries.
 *
 * This function cannot create a collision in only two iterations, so
 * we have two iterations to achieve avalanche.  In those two iterations,
 * we have six layers of mixing, which is enough to spread one bit's
 * influence out to 2^6 = 64 state bits.
 *
 * Rotate constants are scored by considering either 64 one-bit input
 * deltas or 64*63/2 = 2016 two-bit input deltas, and finding the
 * probability of that delta causing a change to each of the 128 output
 * bits, using a sample of random initial states.
 *
 * The Shannon entropy of the computed probabilities is then summed
 * to produce a score.  Ideally, any input change has a 50% chance of
 * toggling any given output bit.
 *
 * Mixing scores (in bits) for (12,45):
 * Input delta: 1-bit      2-bit
 * 1 round:     713.3    42542.6
 * 2 rounds:   2753.7   140389.8
 * 3 rounds:   5954.1   233458.2
 * 4 rounds:   7862.6   256672.2
 * Perfect:    8192     258048
 *            (64*128) (64*63/2 * 128)
 */
/* Mix word @a into the running (x, y) hash state, 64-bit variant. */
#define HASH_MIX(x, y, a)	\
	(	x ^= (a),	\
		y ^= x,	x = rol64(x,12),\
		x += y,	y = rol64(y,45),\
		y *= 9			)
/*
 * Fold two longs into one 32-bit hash value.  This must be fast, but
 * latency isn't quite as critical, as there is a fair bit of additional
 * work done before the hash value is used.
 */
static inline unsigned int fold_hash(unsigned long x, unsigned long y)
{
	/* multiplicative mixing via the 64-bit golden-ratio constant;
	 * the top 32 bits carry the best-mixed bits */
	y ^= x * GOLDEN_RATIO_64;
	y *= GOLDEN_RATIO_64;
	return y >> 32;
}
#else	/* 32-bit case */

/*
 * Mixing scores (in bits) for (7,20):
 * Input delta: 1-bit     2-bit
 * 1 round:     330.3    9201.6
 * 2 rounds:   1246.4   25475.4
 * 3 rounds:   1907.1   31295.1
 * 4 rounds:   2042.3   31718.6
 * Perfect:    2048     31744
 *            (32*64) (32*31/2 * 64)
 */
/* Mix word @a into the running (x, y) hash state, 32-bit variant. */
#define HASH_MIX(x, y, a)	\
	(	x ^= (a),	\
		y ^= x,	x = rol32(x, 7),\
		x += y,	y = rol32(y,20),\
		y *= 9			)
/* Fold two 32-bit state words into the final hash value. */
static inline unsigned int fold_hash(unsigned long x, unsigned long y)
{
	/* Use arch-optimized multiply if one exists */
	return __hash_32(y ^ __hash_32(x));
}
#endif
/*
 * Return the hash of a string of known length.  This is carefully
 * designed to match hash_name(), which is the more critical function.
 * In particular, we must end by hashing a final word containing 0..7
 * payload bytes, to match the way that hash_name() iterates until it
 * finds the delimiter after the name.
 */
unsigned int full_name_hash(const void *salt, const char *name, unsigned int len)
{
	unsigned long a, x = 0, y = (unsigned long)salt;

	for (;;) {
		if (!len)
			goto done;
		/* may read past the end of the buffer, zero-padded safely */
		a = load_unaligned_zeropad(name);
		if (len < sizeof(unsigned long))
			break;
		HASH_MIX(x, y, a);
		name += sizeof(unsigned long);
		len -= sizeof(unsigned long);
	}
	/* final partial word: mask off bytes beyond len before mixing */
	x ^= a & bytemask_from_count(len);
done:
	return fold_hash(x, y);
}
EXPORT_SYMBOL(full_name_hash);
/* Return the "hash_len" (hash and length) of a null-terminated string */
u64 hashlen_string(const void *salt, const char *name)
{
	unsigned long a = 0, x = 0, y = (unsigned long)salt;
	unsigned long adata, mask, len;
	const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS;

	len = 0;
	goto inside;	/* first word needs loading before the zero check */

	do {
		HASH_MIX(x, y, a);
		len += sizeof(unsigned long);
inside:
		a = load_unaligned_zeropad(name+len);
	} while (!has_zero(a, &adata, &constants));

	/* found the word containing the NUL - hash only its payload bytes */
	adata = prep_zero_mask(a, adata, &constants);
	mask = create_zero_mask(adata);
	x ^= a & zero_bytemask(mask);

	return hashlen_create(fold_hash(x, y), len + find_zero(mask));
}
EXPORT_SYMBOL(hashlen_string);
/* * Calculate the length and hash of the path component, and * return the length as the result.
*/ staticinlineconstchar *hash_name(struct nameidata *nd, constchar *name, unsignedlong *lastword)
{ unsignedlong a, b, x, y = (unsignedlong)nd->path.dentry; unsignedlong adata, bdata, mask, len; conststruct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS;
/* * The first iteration is special, because it can result in * '.' and '..' and has no mixing other than the final fold.
*/
a = load_unaligned_zeropad(name);
b = a ^ REPEAT_BYTE('/'); if (has_zero(a, &adata, &constants) | has_zero(b, &bdata, &constants)) {
adata = prep_zero_mask(a, adata, &constants);
bdata = prep_zero_mask(b, bdata, &constants);
mask = create_zero_mask(adata | bdata);
a &= zero_bytemask(mask);
*lastword = a;
len = find_zero(mask);
nd->last.hash = fold_hash(a, y);
nd->last.len = len; return name + len;
}
len = 0;
x = 0; do {
HASH_MIX(x, y, a);
len += sizeof(unsignedlong);
a = load_unaligned_zeropad(name+len);
b = a ^ REPEAT_BYTE('/');
} while (!(has_zero(a, &adata, &constants) | has_zero(b, &bdata, &constants)));
adata = prep_zero_mask(a, adata, &constants);
bdata = prep_zero_mask(b, bdata, &constants);
mask = create_zero_mask(adata | bdata);
a &= zero_bytemask(mask);
x ^= a;
len += find_zero(mask);
*lastword = 0; // Multi-word components cannot be DOT or DOTDOT
/* * Note that the 'last' word is always zero-masked, but * was loaded as a possibly big-endian word.
*/ #ifdef __BIG_ENDIAN #define LAST_WORD_IS_DOT (0x2eul << (BITS_PER_LONG-8)) #define LAST_WORD_IS_DOTDOT (0x2e2eul << (BITS_PER_LONG-16)) #endif
#else/* !CONFIG_DCACHE_WORD_ACCESS: Slow, byte-at-a-time version */
/* Return the hash of a string of known length */
--> --------------------
--> maximum size reached
--> --------------------
¤ Dauer der Verarbeitung: 0.88 Sekunden
(vorverarbeitet)
¤
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung ist noch experimentell.