/*
 * The legacy fanotify marks limit (8192) applies per group. A tunable
 * per-user marks limit was introduced in addition, similar to inotify.
 * Effectively, the legacy per-user limit of fanotify marks is
 * <max marks per group> * <max groups per user>.
 * The resulting default (1M) also happens to match the increased inotify
 * max_user_watches limit since v5.10.
 */
#define FANOTIFY_DEFAULT_MAX_USER_MARKS	\
	(FANOTIFY_OLD_DEFAULT_MAX_MARKS * FANOTIFY_DEFAULT_MAX_GROUPS)
/*
 * Pinning the marked inode accounts for most of the memory cost of adding
 * an inode mark. Filesystem inode structs differ in size from one
 * filesystem to another, so twice the size of a VFS inode is used as a
 * conservative approximation.
 */
#define INODE_MARK_COST	(2 * sizeof(struct inode))
/* configurable via /proc/sys/fs/fanotify/ */
static int fanotify_max_queued_events __read_mostly;
staticvoid __init fanotify_sysctls_init(void)
{
register_sysctl("fs/fanotify", fanotify_table);
} #else #define fanotify_sysctls_init() do { } while (0) #endif/* CONFIG_SYSCTL */
/*
 * Every flag that may be passed in the event_f_flags parameter of
 * fanotify_init.
 *
 * Field f_flags of struct file stores internal and external open flags
 * together. Only external open flags shall be allowed in event_f_flags;
 * internal flags like FMODE_EXEC shall be excluded.
 */
#define FANOTIFY_INIT_ALL_EVENT_F_BITS	(	\
		O_ACCMODE	|		\
		O_APPEND	|		\
		O_NONBLOCK	|		\
		__O_SYNC	|		\
		O_DSYNC		|		\
		O_CLOEXEC	|		\
		O_LARGEFILE	|		\
		O_NOATIME	)
/*
 * Return the combined length of the dir fid + name info records of @event.
 *
 * FAN_RENAME may have one or two dir+name info records, so the first and
 * the second directory file handle are checked independently; a record
 * only contributes when its file handle length is non-zero.
 */
static int fanotify_dir_name_info_len(struct fanotify_event *event)
{
	struct fanotify_info *info = fanotify_event_info(event);
	int dir_fh_len = fanotify_event_dir_fh_len(event);
	int dir2_fh_len = fanotify_event_dir2_fh_len(event);
	int info_len = 0;

	if (dir_fh_len)
		info_len += fanotify_fid_info_len(dir_fh_len,
						  info->name_len);
	if (dir2_fh_len)
		info_len += fanotify_fid_info_len(dir2_fh_len,
						  info->name2_len);

	return info_len;
}
static size_t fanotify_event_len(unsignedint info_mode, struct fanotify_event *event)
{
size_t event_len = FAN_EVENT_METADATA_LEN; int fh_len; int dot_len = 0;
if (fanotify_is_error_event(event->mask))
event_len += FANOTIFY_ERROR_INFO_LEN;
if (fanotify_event_has_any_dir_fh(event)) {
event_len += fanotify_dir_name_info_len(event);
} elseif ((info_mode & FAN_REPORT_NAME) &&
(event->mask & FAN_ONDIR)) { /* * With group flag FAN_REPORT_NAME, if name was not recorded in * event on a directory, we will report the name ".".
*/
dot_len = 1;
}
if (fanotify_event_has_object_fh(event)) {
fh_len = fanotify_event_object_fh_len(event);
event_len += fanotify_fid_info_len(fh_len, dot_len);
} if (fanotify_is_mnt_event(event->mask))
event_len += FANOTIFY_MNT_INFO_LEN;
if (info_mode & FAN_REPORT_PIDFD)
event_len += FANOTIFY_PIDFD_INFO_LEN;
if (fanotify_event_has_access_range(event))
event_len += FANOTIFY_RANGE_INFO_LEN;
return event_len;
}
/*
 * Remove a hashed event from the merge hash table.
 *
 * group->notification_lock is asserted to be held. If @event turns out not
 * to be hashed, a one-time warning is emitted and nothing is removed.
 */
static void fanotify_unhash_event(struct fsnotify_group *group,
				  struct fanotify_event *event)
{
	assert_spin_locked(&group->notification_lock);

	if (WARN_ON_ONCE(hlist_unhashed(&event->merge_list)))
		return;

	hlist_del_init(&event->merge_list);
}
/* * Get an fanotify notification event if one exists and is small * enough to fit in "count". Return an error pointer if the count * is not large enough. When a permission event is dequeued, its state is * set to FAN_EVENT_REPORTED; a hashed event is also removed from the merge * hash table. The notification_lock is released before returning.
*/ staticstruct fanotify_event *get_one_event(struct fsnotify_group *group,
size_t count)
{
size_t event_size; struct fanotify_event *event = NULL; struct fsnotify_event *fsn_event; unsignedint info_mode = FAN_GROUP_FLAG(group, FANOTIFY_INFO_MODES);
/* NOTE(review): the statements that take notification_lock, peek the first * event, assign "event" and compare its size against "count" (and that jump * to "out") are not visible in this excerpt; as shown, "event" is still * NULL below and event_size, fsn_event and info_mode are unused - confirm * against the full file.
*/
/* * Held the notification_lock the whole time, so this is the * same event we peeked above.
*/
fsnotify_remove_first_event(group); if (fanotify_is_perm_event(event->mask))
FANOTIFY_PERM(event)->state = FAN_EVENT_REPORTED; if (fanotify_is_hashed_event(event->mask))
fanotify_unhash_event(group, event);
out:
spin_unlock(&group->notification_lock); return event;
}
client_fd = get_unused_fd_flags(group->fanotify_data.f_flags); if (client_fd < 0) return client_fd;
/* * We provide an fd for the userspace program, so it could access the * file without generating fanotify events itself.
*/
new_file = dentry_open_nonotify(path, group->fanotify_data.f_flags,
current_cred()); if (IS_ERR(new_file)) {
put_unused_fd(client_fd);
client_fd = PTR_ERR(new_file);
} else {
*file = new_file;
}
if (event->state == FAN_EVENT_CANCELED)
destroy = true; else
event->state = FAN_EVENT_ANSWERED;
spin_unlock(&group->notification_lock); if (destroy)
fsnotify_destroy_event(group, &event->fae.fse);
}
staticint process_access_response(struct fsnotify_group *group, struct fanotify_response *response_struct, constchar __user *info,
size_t info_len)
{ struct fanotify_perm_event *event; int fd = response_struct->fd;
u32 response = response_struct->response; int errno = fanotify_get_response_errno(response); int ret = info_len; struct fanotify_response_info_audit_rule friar;
pr_debug("%s: group=%p fd=%d response=%x errno=%d buf=%p size=%zu\n",
__func__, group, fd, response, errno, info, info_len); /* * make sure the response is valid, if invalid we do nothing and either * userspace can send a valid response or we will clean it up after the * timeout
*/ if (response & ~FANOTIFY_RESPONSE_VALID_MASK) return -EINVAL;
switch (response & FANOTIFY_RESPONSE_ACCESS) { case FAN_ALLOW: if (errno) return -EINVAL; break; case FAN_DENY: /* Custom errno is supported only for pre-content groups */ if (errno && group->priority != FSNOTIFY_PRIO_PRE_CONTENT) return -EINVAL;
/* * Limit errno to values expected on open(2)/read(2)/write(2) * of regular files.
*/ switch (errno) { case 0: case EIO: case EPERM: case EBUSY: case ETXTBSY: case EAGAIN: case ENOSPC: case EDQUOT: break; default: return -EINVAL;
} break; default: return -EINVAL;
}
if ((response & FAN_AUDIT) && !FAN_GROUP_FLAG(group, FAN_ENABLE_AUDIT)) return -EINVAL;
if (response & FAN_INFO) {
ret = process_access_response_info(info, info_len, &friar); if (ret < 0) return ret; if (fd == FAN_NOFD) return ret;
} else {
ret = 0;
}
if (fd < 0) return -EINVAL;
spin_lock(&group->notification_lock);
list_for_each_entry(event, &group->fanotify_data.access_list,
fae.fse.list) { if (event->fd != fd) continue;
if (WARN_ON_ONCE(len < sizeof(info) || len > count)) return -EFAULT;
/* * Copy event info fid header followed by variable sized file handle * and optionally followed by variable sized filename.
*/ switch (info_type) { case FAN_EVENT_INFO_TYPE_FID: case FAN_EVENT_INFO_TYPE_DFID: if (WARN_ON_ONCE(name_len)) return -EFAULT; break; case FAN_EVENT_INFO_TYPE_DFID_NAME: case FAN_EVENT_INFO_TYPE_OLD_DFID_NAME: case FAN_EVENT_INFO_TYPE_NEW_DFID_NAME: if (WARN_ON_ONCE(!name || !name_len)) return -EFAULT; break; default: return -EFAULT;
}
/* Mangle handle_type for bad file_handle */ if (!fh_len)
handle.handle_type = FILEID_INVALID;
if (copy_to_user(buf, &handle, sizeof(handle))) return -EFAULT;
buf += sizeof(handle);
len -= sizeof(handle); if (WARN_ON_ONCE(len < fh_len)) return -EFAULT;
/* * For an inline fh and inline file name, copy through stack to exclude * the copy from usercopy hardening protections.
*/
fh_buf = fanotify_fh_buf(fh); if (fh_len <= FANOTIFY_INLINE_FH_LEN) {
memcpy(bounce, fh_buf, fh_len);
fh_buf = bounce;
} if (copy_to_user(buf, fh_buf, fh_len)) return -EFAULT;
buf += fh_len;
len -= fh_len;
if (name_len) { /* Copy the filename with terminating null */
name_len++; if (WARN_ON_ONCE(len < name_len)) return -EFAULT;
if (copy_to_user(buf, name, name_len)) return -EFAULT;
buf += name_len;
len -= name_len;
}
/* Pad with 0's */
WARN_ON_ONCE(len < 0 || len >= FANOTIFY_EVENT_ALIGN); if (len > 0 && clear_user(buf, len)) return -EFAULT;
/* * Event info records order is as follows: * 1. dir fid + name * 2. (optional) new dir fid + new name * 3. (optional) child fid
*/ if (fanotify_event_has_dir_fh(event)) {
info_type = info->name_len ? FAN_EVENT_INFO_TYPE_DFID_NAME :
FAN_EVENT_INFO_TYPE_DFID;
/* FAN_RENAME uses special info types */ if (event->mask & FAN_RENAME)
info_type = FAN_EVENT_INFO_TYPE_OLD_DFID_NAME;
ret = copy_fid_info_to_user(fanotify_event_fsid(event),
fanotify_info_dir_fh(info),
info_type,
fanotify_info_name(info),
info->name_len, buf, count); if (ret < 0) return ret;
buf += ret;
count -= ret;
total_bytes += ret;
}
/* New dir fid+name may be reported in addition to old dir fid+name */ if (fanotify_event_has_dir2_fh(event)) {
info_type = FAN_EVENT_INFO_TYPE_NEW_DFID_NAME;
ret = copy_fid_info_to_user(fanotify_event_fsid(event),
fanotify_info_dir2_fh(info),
info_type,
fanotify_info_name2(info),
info->name2_len, buf, count); if (ret < 0) return ret;
buf += ret;
count -= ret;
total_bytes += ret;
}
if (fanotify_event_has_object_fh(event)) { constchar *dot = NULL; int dot_len = 0;
if (fid_mode == FAN_REPORT_FID || info_type) { /* * With only group flag FAN_REPORT_FID only type FID is * reported. Second info record type is always FID.
*/
info_type = FAN_EVENT_INFO_TYPE_FID;
} elseif ((fid_mode & FAN_REPORT_NAME) &&
(event->mask & FAN_ONDIR)) { /* * With group flag FAN_REPORT_NAME, if name was not * recorded in an event on a directory, report the name * "." with info type DFID_NAME.
*/
info_type = FAN_EVENT_INFO_TYPE_DFID_NAME;
dot = ".";
dot_len = 1;
} elseif ((event->mask & ALL_FSNOTIFY_DIRENT_EVENTS) ||
(event->mask & FAN_ONDIR)) { /* * With group flag FAN_REPORT_DIR_FID, a single info * record has type DFID for directory entry modification * event and for event on a directory.
*/
info_type = FAN_EVENT_INFO_TYPE_DFID;
} else { /* * With group flags FAN_REPORT_DIR_FID|FAN_REPORT_FID, * a single info record has type FID for event on a * non-directory, when there is no directory to report. * For example, on FAN_DELETE_SELF event.
*/
info_type = FAN_EVENT_INFO_TYPE_FID;
}
ret = copy_fid_info_to_user(fanotify_event_fsid(event),
fanotify_event_object_fh(event),
info_type, dot, dot_len,
buf, count); if (ret < 0) return ret;
buf += ret;
count -= ret;
total_bytes += ret;
}
if (pidfd_mode) {
ret = copy_pidfd_info_to_user(pidfd, buf, count); if (ret < 0) return ret;
buf += ret;
count -= ret;
total_bytes += ret;
}
if (fanotify_is_error_event(event->mask)) {
ret = copy_error_info_to_user(event, buf, count); if (ret < 0) return ret;
buf += ret;
count -= ret;
total_bytes += ret;
}
if (fanotify_event_has_access_range(event)) {
ret = copy_range_info_to_user(event, buf, count); if (ret < 0) return ret;
buf += ret;
count -= ret;
total_bytes += ret;
}
if (fanotify_is_mnt_event(event->mask)) {
ret = copy_mnt_info_to_user(event, buf, count); if (ret < 0) return ret;
buf += ret;
count -= ret;
total_bytes += ret;
}
metadata.event_len = fanotify_event_len(info_mode, event);
metadata.metadata_len = FAN_EVENT_METADATA_LEN;
metadata.vers = FANOTIFY_METADATA_VERSION;
metadata.reserved = 0;
metadata.mask = event->mask & FANOTIFY_OUTGOING_EVENTS;
metadata.pid = pid_vnr(event->pid); /* * For an unprivileged listener, event->pid can be used to identify the * events generated by the listener process itself, without disclosing * the pids of other processes.
*/ if (FAN_GROUP_FLAG(group, FANOTIFY_UNPRIV) &&
task_tgid(current) != event->pid)
metadata.pid = 0;
/* * For now, fid mode is required for an unprivileged listener and * fid mode does not report fd in events. Keep this check anyway * for safety in case fid mode requirement is relaxed in the future * to allow unprivileged listener to get events with no fd and no fid.
*/ if (!FAN_GROUP_FLAG(group, FANOTIFY_UNPRIV) &&
path && path->mnt && path->dentry) {
fd = create_fd(group, path, &f); /* * Opening an fd from dentry can fail for several reasons. * For example, when tasks are gone and we try to open their * /proc files or we try to open a WRONLY file like in sysfs * or when trying to open a file that was deleted on the * remote network server. * * For a group with FAN_REPORT_FD_ERROR, we will send the * event with the error instead of the open fd, otherwise * Userspace may not get the error at all. * In any case, userspace will not know which file failed to * open, so add a debug print for further investigation.
*/ if (fd < 0) {
pr_debug("fanotify: create_fd(%pd2) failed err=%d\n",
path->dentry, fd); if (!FAN_GROUP_FLAG(group, FAN_REPORT_FD_ERROR)) { /* * Historically, we've handled EOPENSTALE in a * special way and silently dropped such * events. Now we have to keep it to maintain * backward compatibility...
*/ if (fd == -EOPENSTALE)
fd = 0; return fd;
}
}
} if (FAN_GROUP_FLAG(group, FAN_REPORT_FD_ERROR))
metadata.fd = fd; else
metadata.fd = fd >= 0 ? fd : FAN_NOFD;
if (pidfd_mode) { /* * Complain if the FAN_REPORT_PIDFD and FAN_REPORT_TID mutual * exclusion is ever lifted. At the time of incorporating pidfd * support within fanotify, the pidfd API only supported the * creation of pidfds for thread-group leaders.
*/
WARN_ON_ONCE(FAN_GROUP_FLAG(group, FAN_REPORT_TID));
/* * The PIDTYPE_TGID check for an event->pid is performed * preemptively in an attempt to catch out cases where the event * listener reads events after the event generating process has * already terminated. Depending on flag FAN_REPORT_FD_ERROR, * report either -ESRCH or FAN_NOPIDFD to the event listener in * those cases with all other pidfd creation errors reported as * the error code itself or as FAN_EPIDFD.
*/ if (metadata.pid && pid_has_task(event->pid, PIDTYPE_TGID))
pidfd = pidfd_prepare(event->pid, 0, &pidfd_file);
ret = -EFAULT; /* * Sanity check copy size in case get_one_event() and * event_len sizes ever get out of sync.
*/ if (WARN_ON_ONCE(metadata.event_len > count)) goto out_close_fd;
if (copy_to_user(buf, &metadata, FAN_EVENT_METADATA_LEN)) goto out_close_fd;
add_wait_queue(&group->notification_waitq, &wait); while (1) { /* * User can supply arbitrarily large buffer. Avoid softlockups * in case there are lots of available events.
*/
cond_resched();
event = get_one_event(group, count); if (IS_ERR(event)) {
ret = PTR_ERR(event); break;
}
if (!event) {
ret = -EAGAIN; if (file->f_flags & O_NONBLOCK) break;
ret = -ERESTARTSYS; if (signal_pending(current)) break;
/*
 * Stop new events from arriving in the notification queue. Since
 * userspace cannot use the fanotify fd anymore, no event can enter or
 * leave access_list by now either.
*/
fsnotify_group_stop_queueing(group);
/* * Process all permission events on access_list and notification queue * and simulate reply from userspace.
*/
spin_lock(&group->notification_lock); while (!list_empty(&group->fanotify_data.access_list)) { struct fanotify_perm_event *event;
/* * Destroy all non-permission events. For permission events just * dequeue them and set the response. They will be freed once the * response is consumed and fanotify_get_response() returns.
*/ while ((fsn_event = fsnotify_remove_first_event(group))) { struct fanotify_event *event = FANOTIFY_E(fsn_event);
/* umask bits cannot be removed by user */
mask &= ~umask;
spin_lock(&fsn_mark->lock);
oldmask = fsnotify_calc_mask(fsn_mark); if (!(flags & FANOTIFY_MARK_IGNORE_BITS)) {
fsn_mark->mask &= ~mask;
} else {
fsn_mark->ignore_mask &= ~mask;
}
newmask = fsnotify_calc_mask(fsn_mark); /* * We need to keep the mark around even if remaining mask cannot * result in any events (e.g. mask == FAN_ONDIR) to support incremental * changes to the mask. * Destroy mark when only umask bits remain.
*/
*destroy = !((fsn_mark->mask | fsn_mark->ignore_mask) & ~umask);
spin_unlock(&fsn_mark->lock);
/* * When using FAN_MARK_IGNORE for the first time, mark starts using * independent event flags in ignore mask. After that, trying to * update the ignore mask with the old FAN_MARK_IGNORED_MASK API * will result in EEXIST error.
*/ if (ignore == FAN_MARK_IGNORE)
fsn_mark->flags |= FSNOTIFY_MARK_FLAG_HAS_IGNORE_FLAGS;
/* * Setting FAN_MARK_IGNORED_SURV_MODIFY for the first time may lead to * the removal of the FS_MODIFY bit in calculated mask if it was set * because of an ignore mask that is now going to survive FS_MODIFY.
*/ if (ignore && (fan_flags & FAN_MARK_IGNORED_SURV_MODIFY) &&
!(fsn_mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY)) {
fsn_mark->flags |= FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY; if (!(fsn_mark->mask & FS_MODIFY))
recalc = true;
}
/* * NO_IREF may be removed from a mark, but not added. * When removed, fsnotify_recalc_mask() will take the inode ref.
*/
WARN_ON_ONCE(!want_iref);
fsn_mark->flags &= ~FSNOTIFY_MARK_FLAG_NO_IREF;
/* First mark added will determine if group is single or multi fsid */ if (list_empty(&group->marks_list)) return 0;
/* Find sb of an existing mark */
list_for_each_entry(old, &group->marks_list, g_list) {
conn = READ_ONCE(old->connector); if (!conn) continue;
old_sb = fsnotify_connector_sb(conn); if (old_sb) break;
}
/* Only detached marks left? */ if (!old_sb) return 0;
/* Do not allow mixing of marks with weak and strong fsid */ if ((mark->flags ^ old->flags) & FSNOTIFY_MARK_FLAG_WEAK_FSID) return -EXDEV;
/* Allow mixing of marks with strong fsid from different fs */ if (!fsid->weak) return 0;
/* Do not allow mixing marks with weak fsid from different fs */ if (old_sb != fsid->sb) return -EXDEV;
/* Do not allow mixing marks from different btrfs sub-volumes */ if (!fanotify_fsid_equal(&FANOTIFY_MARK(old)->fsid,
&FANOTIFY_MARK(mark)->fsid)) return -EXDEV;
/*
 * Enforce the per-user marks limit in all containing user ns.
 * A group with FAN_UNLIMITED_MARKS does not contribute to the mark count
 * in the limited groups account.
*/
BUILD_BUG_ON(!(FANOTIFY_ADMIN_INIT_FLAGS & FAN_UNLIMITED_MARKS)); if (!FAN_GROUP_FLAG(group, FAN_UNLIMITED_MARKS) &&
!inc_ucount(ucounts->ns, ucounts->uid, UCOUNT_FANOTIFY_MARKS)) return ERR_PTR(-ENOSPC);
fan_mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL); if (!fan_mark) {
ret = -ENOMEM; goto out_dec_ucounts;
}
mark = &fan_mark->fsn_mark;
fsnotify_init_mark(mark, group); if (fan_flags & FAN_MARK_EVICTABLE)
mark->flags |= FSNOTIFY_MARK_FLAG_NO_IREF;
/* Cache fsid of filesystem containing the marked object */ if (fsid) {
ret = fanotify_set_mark_fsid(group, mark, fsid); if (ret) goto out_put_mark;
} else {
fan_mark->fsid.val[0] = fan_mark->fsid.val[1] = 0;
}
ret = fsnotify_add_mark_locked(mark, obj, obj_type, 0); if (ret) goto out_put_mark;
staticint fanotify_may_update_existing_mark(struct fsnotify_mark *fsn_mark,
__u32 mask, unsignedint fan_flags)
{ /* * Non evictable mark cannot be downgraded to evictable mark.
*/ if (fan_flags & FAN_MARK_EVICTABLE &&
!(fsn_mark->flags & FSNOTIFY_MARK_FLAG_NO_IREF)) return -EEXIST;
/* * New ignore mask semantics cannot be downgraded to old semantics.
*/ if (fan_flags & FAN_MARK_IGNORED_MASK &&
fsn_mark->flags & FSNOTIFY_MARK_FLAG_HAS_IGNORE_FLAGS) return -EEXIST;
/* * An ignore mask that survives modify could never be downgraded to not * survive modify. With new FAN_MARK_IGNORE semantics we make that rule * explicit and return an error when trying to update the ignore mask * without the original FAN_MARK_IGNORED_SURV_MODIFY value.
*/ if (fan_flags & FAN_MARK_IGNORE &&
!(fan_flags & FAN_MARK_IGNORED_SURV_MODIFY) &&
fsn_mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY) return -EEXIST;
/* For now pre-content events are not generated for directories */
mask |= fsn_mark->mask; if (mask & FANOTIFY_PRE_CONTENT_EVENTS && mask & FAN_ONDIR) return -EEXIST;
/* * Check if requested mark flags conflict with an existing mark flags.
*/
ret = fanotify_may_update_existing_mark(fsn_mark, mask, fan_flags); if (ret) goto out;
/* * Error events are pre-allocated per group, only if strictly * needed (i.e. FAN_FS_ERROR was requested).
*/ if (!(fan_flags & FANOTIFY_MARK_IGNORE_BITS) &&
(mask & FAN_FS_ERROR)) {
ret = fanotify_group_init_error_pool(group); if (ret) goto out;
}
recalc = fanotify_mark_add_to_mask(fsn_mark, mask, fan_flags); if (recalc)
fsnotify_recalc_mask(fsn_mark->connector);
if (!capable(CAP_SYS_ADMIN)) { /* * An unprivileged user can setup an fanotify group with * limited functionality - an unprivileged group is limited to * notification events with file handles or mount ids and it * cannot use unlimited queue/marks.
*/ if ((flags & FANOTIFY_ADMIN_INIT_FLAGS) ||
!(flags & (FANOTIFY_FID_BITS | FAN_REPORT_MNT))) return -EPERM;
/* * Setting the internal flag FANOTIFY_UNPRIV on the group * prevents setting mount/filesystem marks on this group and * prevents reporting pid and open fd in events.
*/
internal_flags |= FANOTIFY_UNPRIV;
}
#ifdef CONFIG_AUDITSYSCALL if (flags & ~(FANOTIFY_INIT_FLAGS | FAN_ENABLE_AUDIT)) #else if (flags & ~FANOTIFY_INIT_FLAGS) #endif return -EINVAL;
/* * A pidfd can only be returned for a thread-group leader; thus * FAN_REPORT_PIDFD and FAN_REPORT_TID need to remain mutually * exclusive.
*/ if ((flags & FAN_REPORT_PIDFD) && (flags & FAN_REPORT_TID)) return -EINVAL;
/* Don't allow mixing mnt events with inode events for now */ if (flags & FAN_REPORT_MNT) { if (class != FAN_CLASS_NOTIF) return -EINVAL; if (flags & (FANOTIFY_FID_BITS | FAN_REPORT_FD_ERROR)) return -EINVAL;
}
if (event_f_flags & ~FANOTIFY_INIT_ALL_EVENT_F_BITS) return -EINVAL;
switch (event_f_flags & O_ACCMODE) { case O_RDONLY: case O_RDWR: case O_WRONLY: break; default: return -EINVAL;
}
if (fid_mode && class != FAN_CLASS_NOTIF) return -EINVAL;
/* * Child name is reported with parent fid so requires dir fid. * We can report both child fid and dir fid with or without name.
*/ if ((fid_mode & FAN_REPORT_NAME) && !(fid_mode & FAN_REPORT_DIR_FID)) return -EINVAL;
/* * FAN_REPORT_TARGET_FID requires FAN_REPORT_NAME and FAN_REPORT_FID * and is used as an indication to report both dir and child fid on all * dirent events.
*/ if ((fid_mode & FAN_REPORT_TARGET_FID) &&
(!(fid_mode & FAN_REPORT_NAME) || !(fid_mode & FAN_REPORT_FID))) return -EINVAL;
f_flags = O_RDWR; if (flags & FAN_CLOEXEC)
f_flags |= O_CLOEXEC; if (flags & FAN_NONBLOCK)
f_flags |= O_NONBLOCK;
/* fsnotify_alloc_group takes a ref. Dropped in fanotify_release */
group = fsnotify_alloc_group(&fanotify_fsnotify_ops,
FSNOTIFY_GROUP_USER); if (IS_ERR(group)) { return PTR_ERR(group);
}
/* Enforce groups limits per user in all containing user ns */
group->fanotify_data.ucounts = inc_ucount(user_ns, current_euid(),
UCOUNT_FANOTIFY_GROUPS); if (!group->fanotify_data.ucounts) {
fd = -EMFILE; goto out_destroy_group;
}
/* * Make sure dentry is not of a filesystem subvolume (e.g. btrfs) * which uses a different fsid than sb root.
*/
err = vfs_get_fsid(dentry->d_sb->s_root, &root_fsid); if (err) return err;
if (!fanotify_fsid_equal(&root_fsid, &fsid->id)) {
err = -EXDEV; goto weak;
}
/* Check if filesystem can encode a unique fid */ staticint fanotify_test_fid(struct dentry *dentry, unsignedint flags)
{ unsignedint mark_type = flags & FANOTIFY_MARK_TYPE_BITS; conststruct export_operations *nop = dentry->d_sb->s_export_op;
/* * We need to make sure that the filesystem supports encoding of * file handles so user can use name_to_handle_at() to compare fids * reported with events to the file handle of watched objects.
*/ if (!exportfs_can_encode_fid(nop)) return -EOPNOTSUPP;
/* * For sb/mount mark, we also need to make sure that the filesystem * supports decoding file handles, so user has a way to map back the * reported fids to filesystem objects.
*/ if (mark_type != FAN_MARK_INODE && !exportfs_can_decode_fh(nop)) return -EOPNOTSUPP;
/*
 * Filesystems need to opt into pre-content events (a.k.a. HSM)
 * and they are only supported on regular files and directories.
*/ if (mask & FANOTIFY_PRE_CONTENT_EVENTS) { if (!(path->mnt->mnt_sb->s_iflags & SB_I_ALLOW_HSM)) return -EOPNOTSUPP; if (!is_dir && !d_is_reg(path->dentry)) return -EINVAL;
}
/* * Some filesystems such as 'proc' acquire unusual locks when opening * files. For them fanotify permission events have high chances of * deadlocking the system - open done when reporting fanotify event * blocks on this "unusual" lock while another process holding the lock * waits for fanotify permission event to be answered. Just disallow * permission events for such filesystems.
*/ if (mask & FANOTIFY_PERM_EVENTS &&
path->mnt->mnt_sb->s_type->fs_flags & FS_DISALLOW_NOTIFY_PERM) return -EINVAL;
/*
 * mount and sb marks are not allowed on kernel internal pseudo fs,
 * like pipe_mnt, because that would subscribe to events on all the
 * anonymous pipes in the system.
 *
 * SB_NOUSER covers all of the internal pseudo fs whose objects are not
 * exposed to user's mount namespace, but there are other SB_KERNMOUNT
 * fs, like nsfs, debugfs, for which the value of allowing sb and mount
 * mark is questionable. For now we leave them alone.
*/ if (mark_type != FAN_MARK_INODE &&
path->mnt->mnt_sb->s_flags & SB_NOUSER) return -EINVAL;
/* * We shouldn't have allowed setting dirent events and the directory * flags FAN_ONDIR and FAN_EVENT_ON_CHILD in mask of non-dir inode, * but because we always allowed it, error only when using new APIs.
*/ if (strict_dir_events && mark_type == FAN_MARK_INODE &&
!is_dir && (mask & FANOTIFY_DIRONLY_EVENT_BITS)) return -ENOTDIR;
/* we only use the lower 32 bits as of right now. */ if (upper_32_bits(mask)) return -EINVAL;
if (flags & ~FANOTIFY_MARK_FLAGS) return -EINVAL;
switch (mark_type) { case FAN_MARK_INODE:
obj_type = FSNOTIFY_OBJ_TYPE_INODE; break; case FAN_MARK_MOUNT:
obj_type = FSNOTIFY_OBJ_TYPE_VFSMOUNT; break; case FAN_MARK_FILESYSTEM:
obj_type = FSNOTIFY_OBJ_TYPE_SB; break; case FAN_MARK_MNTNS:
obj_type = FSNOTIFY_OBJ_TYPE_MNTNS; break; default: return -EINVAL;
}
switch (mark_cmd) { case FAN_MARK_ADD: case FAN_MARK_REMOVE: if (!mask) return -EINVAL; break; case FAN_MARK_FLUSH: if (flags & ~(FANOTIFY_MARK_TYPE_BITS | FAN_MARK_FLUSH)) return -EINVAL; break; default: return -EINVAL;
}
if (IS_ENABLED(CONFIG_FANOTIFY_ACCESS_PERMISSIONS))
valid_mask |= FANOTIFY_PERM_EVENTS;
if (mask & ~valid_mask) return -EINVAL;
/* We don't allow FAN_MARK_IGNORE & FAN_MARK_IGNORED_MASK together */ if (ignore == (FAN_MARK_IGNORE | FAN_MARK_IGNORED_MASK)) return -EINVAL;
/* * Event flags (FAN_ONDIR, FAN_EVENT_ON_CHILD) have no effect with * FAN_MARK_IGNORED_MASK.
*/ if (ignore == FAN_MARK_IGNORED_MASK) {
mask &= ~FANOTIFY_EVENT_FLAGS;
umask = FANOTIFY_EVENT_FLAGS;
}
CLASS(fd, f)(fanotify_fd); if (fd_empty(f)) return -EBADF;
/* verify that this is indeed an fanotify instance */ if (unlikely(fd_file(f)->f_op != &fanotify_fops)) return -EINVAL;
group = fd_file(f)->private_data;
/* Only report mount events on mnt namespace */ if (FAN_GROUP_FLAG(group, FAN_REPORT_MNT)) { if (mask & ~FANOTIFY_MOUNT_EVENTS) return -EINVAL; if (mark_type != FAN_MARK_MNTNS) return -EINVAL;
} else { if (mask & FANOTIFY_MOUNT_EVENTS) return -EINVAL; if (mark_type == FAN_MARK_MNTNS) return -EINVAL;
}
/* * A user is allowed to setup sb/mount/mntns marks only if it is * capable in the user ns where the group was created.
*/ if (!ns_capable(group->user_ns, CAP_SYS_ADMIN) &&
mark_type != FAN_MARK_INODE) return -EPERM;
/* * Permission events are not allowed for FAN_CLASS_NOTIF. * Pre-content permission events are not allowed for FAN_CLASS_CONTENT.
*/ if (mask & FANOTIFY_PERM_EVENTS &&
group->priority == FSNOTIFY_PRIO_NORMAL) return -EINVAL; elseif (mask & FANOTIFY_PRE_CONTENT_EVENTS &&
group->priority == FSNOTIFY_PRIO_CONTENT) return -EINVAL;
if (mask & FAN_FS_ERROR &&
mark_type != FAN_MARK_FILESYSTEM) return -EINVAL;
/* * Evictable is only relevant for inode marks, because only inode object * can be evicted on memory pressure.
*/ if (flags & FAN_MARK_EVICTABLE &&
mark_type != FAN_MARK_INODE) return -EINVAL;
/* * Events that do not carry enough information to report * event->fd require a group that supports reporting fid. Those * events are not supported on a mount mark, because they do not * carry enough information (i.e. path) to be filtered by mount * point.
*/
fid_mode = FAN_GROUP_FLAG(group, FANOTIFY_FID_BITS); if (mask & ~(FANOTIFY_FD_EVENTS|FANOTIFY_MOUNT_EVENTS|FANOTIFY_EVENT_FLAGS) &&
(!fid_mode || mark_type == FAN_MARK_MOUNT)) return -EINVAL;
/* * FAN_RENAME uses special info type records to report the old and * new parent+name. Reporting only old and new parent id is less * useful and was not implemented.
*/ if (mask & FAN_RENAME && !(fid_mode & FAN_REPORT_NAME)) return -EINVAL;
/* Pre-content events are not currently generated for directories. */ if (mask & FANOTIFY_PRE_CONTENT_EVENTS && mask & FAN_ONDIR) return -EINVAL;
if (mark_cmd == FAN_MARK_FLUSH) {
fsnotify_clear_marks_by_group(group, obj_type); return 0;
}
ret = fanotify_find_path(dfd, pathname, &path, flags,
(mask & ALL_FSNOTIFY_EVENTS), obj_type); if (ret) return ret;
if (mark_cmd == FAN_MARK_ADD) {
ret = fanotify_events_supported(group, &path, mask, flags); if (ret) goto path_put_and_out;
}
if (fid_mode) {
ret = fanotify_test_fsid(path.dentry, flags, &__fsid); if (ret) goto path_put_and_out;
ret = fanotify_test_fid(path.dentry, flags); if (ret) goto path_put_and_out;
fsid = &__fsid;
}
/* * In addition to being capable in the user ns where group was created, * the user also needs to be capable in the user ns associated with * the filesystem or in the user ns associated with the mntns * (when marking mntns).
*/ if (obj_type == FSNOTIFY_OBJ_TYPE_INODE) {
inode = path.dentry->d_inode;
obj = inode;
} elseif (obj_type == FSNOTIFY_OBJ_TYPE_VFSMOUNT) {
user_ns = path.mnt->mnt_sb->s_user_ns;
obj = path.mnt;
} elseif (obj_type == FSNOTIFY_OBJ_TYPE_SB) {
user_ns = path.mnt->mnt_sb->s_user_ns;
obj = path.mnt->mnt_sb;
} elseif (obj_type == FSNOTIFY_OBJ_TYPE_MNTNS) {
ret = -EINVAL;
mntns = mnt_ns_from_dentry(path.dentry); if (!mntns) goto path_put_and_out;
user_ns = mntns->user_ns;
obj = mntns;
}
ret = -EPERM; if (user_ns && !ns_capable(user_ns, CAP_SYS_ADMIN)) goto path_put_and_out;
ret = -EINVAL; if (!obj) goto path_put_and_out;
/* * If some other task has this inode open for write we should not add * an ignore mask, unless that ignore mask is supposed to survive * modification changes anyway.
*/ if (mark_cmd == FAN_MARK_ADD && (flags & FANOTIFY_MARK_IGNORE_BITS) &&
!(flags & FAN_MARK_IGNORED_SURV_MODIFY)) {
ret = !inode ? -EINVAL : -EISDIR; /* FAN_MARK_IGNORE requires SURV_MODIFY for sb/mount/dir marks */ if (ignore == FAN_MARK_IGNORE &&
(!inode || S_ISDIR(inode->i_mode))) goto path_put_and_out;
ret = 0; if (inode && inode_is_open_for_write(inode)) goto path_put_and_out;
}
/* Mask out FAN_EVENT_ON_CHILD flag for sb/mount/non-dir marks */ if (!inode || !S_ISDIR(inode->i_mode)) {
mask &= ~FAN_EVENT_ON_CHILD;
umask = FAN_EVENT_ON_CHILD; /* * If group needs to report parent fid, register for getting * events with parent/name info for non-directory.
*/ if ((fid_mode & FAN_REPORT_DIR_FID) &&
(flags & FAN_MARK_ADD) && !ignore)
mask |= FAN_EVENT_ON_CHILD;
}
/* create/update an inode mark */ switch (mark_cmd) { case FAN_MARK_ADD:
ret = fanotify_add_mark(group, obj, obj_type, mask, flags,
fsid); break; case FAN_MARK_REMOVE:
ret = fanotify_remove_mark(group, obj, obj_type, mask, flags,
umask); break; default:
ret = -EINVAL;
}
/* * fanotify_user_setup - Our initialization function. Note that we cannot return * error because we have compiled-in VFS hooks. So an (unlikely) failure here * must result in panic().
*/ staticint __init fanotify_user_setup(void)
{ struct sysinfo si; int max_marks;
si_meminfo(&si); /* * Allow up to 1% of addressable memory to be accounted for per user * marks limited to the range [8192, 1048576]. mount and sb marks are * a lot cheaper than inode marks, but there is no reason for a user * to have many of those, so calculate by the cost of inode marks.
*/
max_marks = (((si.totalram - si.totalhigh) / 100) << PAGE_SHIFT) /
INODE_MARK_COST;
max_marks = clamp(max_marks, FANOTIFY_OLD_DEFAULT_MAX_MARKS,
FANOTIFY_DEFAULT_MAX_USER_MARKS);
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit, noch Richtigkeit,
noch Qualität der bereit gestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.