#include "pnode.h"
#include "internal.h"
+/* Maximum number of mounts in a mount namespace */
+unsigned int sysctl_mount_max __read_mostly = 100000;
+
static unsigned int m_hash_mask __read_mostly;
static unsigned int m_hash_shift __read_mostly;
static unsigned int mp_hash_mask __read_mostly;
INIT_LIST_HEAD(&mnt->mnt_slave_list);
INIT_LIST_HEAD(&mnt->mnt_slave);
INIT_HLIST_NODE(&mnt->mnt_mp_list);
+ INIT_LIST_HEAD(&mnt->mnt_umounting);
#ifdef CONFIG_FSNOTIFY
INIT_HLIST_HEAD(&mnt->mnt_fsnotify_marks);
#endif
return 0;
mnt = real_mount(bastard);
mnt_add_count(mnt, 1);
+ smp_mb(); // see mntput_no_expire()
if (likely(!read_seqretry(&mount_lock, seq)))
return 0;
if (bastard->mnt_flags & MNT_SYNC_UMOUNT) {
mnt_add_count(mnt, -1);
return 1;
}
+ lock_mount_hash();
+ if (unlikely(bastard->mnt_flags & MNT_DOOMED)) {
+ mnt_add_count(mnt, -1);
+ unlock_mount_hash();
+ return 1;
+ }
+ unlock_mount_hash();
+ /* caller will mntput() */
return -1;
}
}
/*
- * find the last mount at @dentry on vfsmount @mnt.
- * mount_lock must be held.
- */
-struct mount *__lookup_mnt_last(struct vfsmount *mnt, struct dentry *dentry)
-{
- struct mount *p, *res = NULL;
- p = __lookup_mnt(mnt, dentry);
- if (!p)
- goto out;
- if (!(p->mnt.mnt_flags & MNT_UMOUNT))
- res = p;
- hlist_for_each_entry_continue(p, mnt_hash) {
- if (&p->mnt_parent->mnt != mnt || p->mnt_mountpoint != dentry)
- break;
- if (!(p->mnt.mnt_flags & MNT_UMOUNT))
- res = p;
- }
-out:
- return res;
-}
-
-/*
* lookup_mnt - Return the first child mount mounted at path
*
* "First" means first mounted chronologically. If you create the
return NULL;
}
-static struct mountpoint *new_mountpoint(struct dentry *dentry)
+static struct mountpoint *get_mountpoint(struct dentry *dentry)
{
- struct hlist_head *chain = mp_hash(dentry);
- struct mountpoint *mp;
+ struct mountpoint *mp, *new = NULL;
int ret;
- mp = kmalloc(sizeof(struct mountpoint), GFP_KERNEL);
- if (!mp)
+ if (d_mountpoint(dentry)) {
+mountpoint:
+ read_seqlock_excl(&mount_lock);
+ mp = lookup_mountpoint(dentry);
+ read_sequnlock_excl(&mount_lock);
+ if (mp)
+ goto done;
+ }
+
+ if (!new)
+ new = kmalloc(sizeof(struct mountpoint), GFP_KERNEL);
+ if (!new)
return ERR_PTR(-ENOMEM);
+
+ /* Exactly one processes may set d_mounted */
ret = d_set_mounted(dentry);
- if (ret) {
- kfree(mp);
- return ERR_PTR(ret);
- }
- mp->m_dentry = dentry;
- mp->m_count = 1;
- hlist_add_head(&mp->m_hash, chain);
- INIT_HLIST_HEAD(&mp->m_list);
+ /* Someone else set d_mounted? */
+ if (ret == -EBUSY)
+ goto mountpoint;
+
+ /* The dentry is not available as a mountpoint? */
+ mp = ERR_PTR(ret);
+ if (ret)
+ goto done;
+
+ /* Add the new mountpoint to the hash table */
+ read_seqlock_excl(&mount_lock);
+ new->m_dentry = dentry;
+ new->m_count = 1;
+ hlist_add_head(&new->m_hash, mp_hash(dentry));
+ INIT_HLIST_HEAD(&new->m_list);
+ read_sequnlock_excl(&mount_lock);
+
+ mp = new;
+ new = NULL;
+done:
+ kfree(new);
return mp;
}
hlist_add_head(&child_mnt->mnt_mp_list, &mp->m_list);
}
+static void __attach_mnt(struct mount *mnt, struct mount *parent)
+{
+ hlist_add_head_rcu(&mnt->mnt_hash,
+ m_hash(&parent->mnt, mnt->mnt_mountpoint));
+ list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
+}
+
/*
* vfsmount lock must be held for write
*/
struct mountpoint *mp)
{
mnt_set_mountpoint(parent, mp, mnt);
- hlist_add_head_rcu(&mnt->mnt_hash, m_hash(&parent->mnt, mp->m_dentry));
- list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
+ __attach_mnt(mnt, parent);
}
-static void attach_shadowed(struct mount *mnt,
- struct mount *parent,
- struct mount *shadows)
+void mnt_change_mountpoint(struct mount *parent, struct mountpoint *mp, struct mount *mnt)
{
- if (shadows) {
- hlist_add_behind_rcu(&mnt->mnt_hash, &shadows->mnt_hash);
- list_add(&mnt->mnt_child, &shadows->mnt_child);
- } else {
- hlist_add_head_rcu(&mnt->mnt_hash,
- m_hash(&parent->mnt, mnt->mnt_mountpoint));
- list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
- }
+ struct mountpoint *old_mp = mnt->mnt_mp;
+ struct dentry *old_mountpoint = mnt->mnt_mountpoint;
+ struct mount *old_parent = mnt->mnt_parent;
+
+ list_del_init(&mnt->mnt_child);
+ hlist_del_init(&mnt->mnt_mp_list);
+ hlist_del_init_rcu(&mnt->mnt_hash);
+
+ attach_mnt(mnt, parent, mp);
+
+ put_mountpoint(old_mp);
+
+ /*
+ * Safely avoid even the suggestion this code might sleep or
+ * lock the mount hash by taking advantage of the knowledge that
+ * mnt_change_mountpoint will not release the final reference
+ * to a mountpoint.
+ *
+ * During mounting, the mount passed in as the parent mount will
+ * continue to use the old mountpoint and during unmounting, the
+ * old mountpoint will continue to exist until namespace_unlock,
+ * which happens well after mnt_change_mountpoint.
+ */
+ spin_lock(&old_mountpoint->d_lock);
+ old_mountpoint->d_lockref.count--;
+ spin_unlock(&old_mountpoint->d_lock);
+
+ mnt_add_count(old_parent, -1);
}
/*
* vfsmount lock must be held for write
*/
-static void commit_tree(struct mount *mnt, struct mount *shadows)
+static void commit_tree(struct mount *mnt)
{
struct mount *parent = mnt->mnt_parent;
struct mount *m;
list_splice(&head, n->list.prev);
- attach_shadowed(mnt, parent, shadows);
+ n->mounts += n->pending_mounts;
+ n->pending_mounts = 0;
+
+ __attach_mnt(mnt, parent);
touch_mnt_namespace(n);
}
goto out_free;
}
- mnt->mnt.mnt_flags = old->mnt.mnt_flags & ~(MNT_WRITE_HOLD|MNT_MARKED);
+ mnt->mnt.mnt_flags = old->mnt.mnt_flags;
+ mnt->mnt.mnt_flags &= ~(MNT_WRITE_HOLD|MNT_MARKED|MNT_INTERNAL);
/* Don't allow unprivileged users to change mount flags */
if (flag & CL_UNPRIVILEGED) {
mnt->mnt.mnt_flags |= MNT_LOCK_ATIME;
static void mntput_no_expire(struct mount *mnt)
{
rcu_read_lock();
- mnt_add_count(mnt, -1);
- if (likely(mnt->mnt_ns)) { /* shouldn't be the last one */
+ if (likely(READ_ONCE(mnt->mnt_ns))) {
+ /*
+ * Since we don't do lock_mount_hash() here,
+ * ->mnt_ns can change under us. However, if it's
+ * non-NULL, then there's a reference that won't
+ * be dropped until after an RCU delay done after
+ * turning ->mnt_ns NULL. So if we observe it
+ * non-NULL under rcu_read_lock(), the reference
+ * we are dropping is not the final one.
+ */
+ mnt_add_count(mnt, -1);
rcu_read_unlock();
return;
}
lock_mount_hash();
+ /*
+ * make sure that if __legitimize_mnt() has not seen us grab
+ * mount_lock, we'll see their refcount increment here.
+ */
+ smp_mb();
+ mnt_add_count(mnt, -1);
if (mnt_get_count(mnt)) {
rcu_read_unlock();
unlock_mount_hash();
propagate_umount(&tmp_list);
while (!list_empty(&tmp_list)) {
+ struct mnt_namespace *ns;
bool disconnect;
p = list_first_entry(&tmp_list, struct mount, mnt_list);
list_del_init(&p->mnt_expire);
list_del_init(&p->mnt_list);
- __touch_mnt_namespace(p->mnt_ns);
+ ns = p->mnt_ns;
+ if (ns) {
+ ns->mounts--;
+ __touch_mnt_namespace(ns);
+ }
p->mnt_ns = NULL;
if (how & UMOUNT_SYNC)
p->mnt.mnt_flags |= MNT_SYNC_UMOUNT;
namespace_lock();
lock_mount_hash();
- event++;
+ /* Recheck MNT_LOCKED with the locks held */
+ retval = -EINVAL;
+ if (mnt->mnt.mnt_flags & MNT_LOCKED)
+ goto out;
+
+ event++;
if (flags & MNT_DETACH) {
if (!list_empty(&mnt->mnt_list))
umount_tree(mnt, UMOUNT_PROPAGATE);
retval = 0;
}
}
+out:
unlock_mount_hash();
namespace_unlock();
return retval;
struct mount *mnt;
namespace_lock();
+ lock_mount_hash();
mp = lookup_mountpoint(dentry);
if (IS_ERR_OR_NULL(mp))
goto out_unlock;
- lock_mount_hash();
+ event++;
while (!hlist_empty(&mp->m_list)) {
mnt = hlist_entry(mp->m_list.first, struct mount, mnt_mp_list);
if (mnt->mnt.mnt_flags & MNT_UMOUNT) {
}
else umount_tree(mnt, UMOUNT_CONNECTED);
}
- unlock_mount_hash();
put_mountpoint(mp);
out_unlock:
+ unlock_mount_hash();
namespace_unlock();
}
goto dput_and_out;
if (!check_mnt(mnt))
goto dput_and_out;
- if (mnt->mnt.mnt_flags & MNT_LOCKED)
+ if (mnt->mnt.mnt_flags & MNT_LOCKED) /* Check optimistically */
goto dput_and_out;
retval = -EPERM;
if (flags & MNT_FORCE && !capable(CAP_SYS_ADMIN))
continue;
for (s = r; s; s = next_mnt(s, r)) {
- struct mount *t = NULL;
if (!(flag & CL_COPY_UNBINDABLE) &&
IS_MNT_UNBINDABLE(s)) {
- s = skip_mnt_tree(s);
- continue;
+ if (s->mnt.mnt_flags & MNT_LOCKED) {
+ /* Both unbindable and locked. */
+ q = ERR_PTR(-EPERM);
+ goto out;
+ } else {
+ s = skip_mnt_tree(s);
+ continue;
+ }
}
if (!(flag & CL_COPY_MNT_NS_FILE) &&
is_mnt_ns_file(s->mnt.mnt_root)) {
goto out;
lock_mount_hash();
list_add_tail(&q->mnt_list, &res->mnt_list);
- mnt_set_mountpoint(parent, p->mnt_mp, q);
- if (!list_empty(&parent->mnt_mounts)) {
- t = list_last_entry(&parent->mnt_mounts,
- struct mount, mnt_child);
- if (t->mnt_mp != p->mnt_mp)
- t = NULL;
- }
- attach_shadowed(q, parent, t);
+ attach_mnt(q, parent, p->mnt_mp);
unlock_mount_hash();
}
}
{
namespace_lock();
lock_mount_hash();
- umount_tree(real_mount(mnt), UMOUNT_SYNC);
+ umount_tree(real_mount(mnt), 0);
unlock_mount_hash();
namespace_unlock();
}
return 0;
}
+int count_mounts(struct mnt_namespace *ns, struct mount *mnt)
+{
+ unsigned int max = READ_ONCE(sysctl_mount_max);
+ unsigned int mounts = 0, old, pending, sum;
+ struct mount *p;
+
+ for (p = mnt; p; p = next_mnt(p, mnt))
+ mounts++;
+
+ old = ns->mounts;
+ pending = ns->pending_mounts;
+ sum = old + pending;
+ if ((old > sum) ||
+ (pending > sum) ||
+ (max < sum) ||
+ (mounts > (max - sum)))
+ return -ENOSPC;
+
+ ns->pending_mounts = pending + mounts;
+ return 0;
+}
+
/*
* @source_mnt : mount tree to be attached
* @nd : place the mount tree @source_mnt is attached
struct path *parent_path)
{
HLIST_HEAD(tree_list);
+ struct mnt_namespace *ns = dest_mnt->mnt_ns;
+ struct mountpoint *smp;
struct mount *child, *p;
struct hlist_node *n;
int err;
+ /* Preallocate a mountpoint in case the new mounts need
+ * to be tucked under other mounts.
+ */
+ smp = get_mountpoint(source_mnt->mnt.mnt_root);
+ if (IS_ERR(smp))
+ return PTR_ERR(smp);
+
+ /* Is there space to add these mounts to the mount namespace? */
+ if (!parent_path) {
+ err = count_mounts(ns, source_mnt);
+ if (err)
+ goto out;
+ }
+
if (IS_MNT_SHARED(dest_mnt)) {
err = invent_group_ids(source_mnt, true);
if (err)
touch_mnt_namespace(source_mnt->mnt_ns);
} else {
mnt_set_mountpoint(dest_mnt, dest_mp, source_mnt);
- commit_tree(source_mnt, NULL);
+ commit_tree(source_mnt);
}
hlist_for_each_entry_safe(child, n, &tree_list, mnt_hash) {
struct mount *q;
hlist_del_init(&child->mnt_hash);
- q = __lookup_mnt_last(&child->mnt_parent->mnt,
- child->mnt_mountpoint);
- commit_tree(child, q);
+ q = __lookup_mnt(&child->mnt_parent->mnt,
+ child->mnt_mountpoint);
+ if (q)
+ mnt_change_mountpoint(child, smp, q);
+ commit_tree(child);
}
+ put_mountpoint(smp);
unlock_mount_hash();
return 0;
out_cleanup_ids:
while (!hlist_empty(&tree_list)) {
child = hlist_entry(tree_list.first, struct mount, mnt_hash);
+ child->mnt_parent->mnt_ns->pending_mounts = 0;
umount_tree(child, UMOUNT_SYNC);
}
unlock_mount_hash();
cleanup_group_ids(source_mnt, NULL);
out:
+ ns->pending_mounts = 0;
+
+ read_seqlock_excl(&mount_lock);
+ put_mountpoint(smp);
+ read_sequnlock_excl(&mount_lock);
+
return err;
}
namespace_lock();
mnt = lookup_mnt(path);
if (likely(!mnt)) {
- struct mountpoint *mp = lookup_mountpoint(dentry);
- if (!mp)
- mp = new_mountpoint(dentry);
+ struct mountpoint *mp = get_mountpoint(dentry);
if (IS_ERR(mp)) {
namespace_unlock();
mutex_unlock(&dentry->d_inode->i_mutex);
static void unlock_mount(struct mountpoint *where)
{
struct dentry *dentry = where->m_dentry;
+
+ read_seqlock_excl(&mount_lock);
put_mountpoint(where);
+ read_sequnlock_excl(&mount_lock);
+
namespace_unlock();
mutex_unlock(&dentry->d_inode->i_mutex);
}
mnt_flags |= MNT_NODEV | MNT_LOCK_NODEV;
}
if (type->fs_flags & FS_USERNS_VISIBLE) {
- if (!fs_fully_visible(type, &mnt_flags))
+ if (!fs_fully_visible(type, &mnt_flags)) {
+ put_filesystem(type);
return -EPERM;
+ }
}
}
init_waitqueue_head(&new_ns->poll);
new_ns->event = 0;
new_ns->user_ns = get_user_ns(user_ns);
+ new_ns->mounts = 0;
+ new_ns->pending_mounts = 0;
return new_ns;
}
q = new;
while (p) {
q->mnt_ns = new_ns;
+ new_ns->mounts++;
if (new_fs) {
if (&p->mnt == new_fs->root.mnt) {
new_fs->root.mnt = mntget(&q->mnt);
struct mount *mnt = real_mount(m);
mnt->mnt_ns = new_ns;
new_ns->root = mnt;
+ new_ns->mounts++;
list_add(&mnt->mnt_list, &new_ns->list);
} else {
mntput(m);
touch_mnt_namespace(current->nsproxy->mnt_ns);
/* A moved mount should not expire automatically */
list_del_init(&new_mnt->mnt_expire);
+ put_mountpoint(root_mp);
unlock_mount_hash();
chroot_fs_refs(&root, &new);
- put_mountpoint(root_mp);
error = 0;
out4:
unlock_mount(old_mp);
if (mnt->mnt.mnt_sb->s_iflags & SB_I_NOEXEC)
mnt_flags &= ~(MNT_LOCK_NOSUID | MNT_LOCK_NOEXEC);
+ /* Don't miss readonly hidden in the superblock flags */
+ if (mnt->mnt.mnt_sb->s_flags & MS_RDONLY)
+ mnt_flags |= MNT_LOCK_READONLY;
+
/* Verify the mount flags are equal to or more permissive
* than the proposed new mount.
*/
list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) {
struct inode *inode = child->mnt_mountpoint->d_inode;
/* Only worry about locked mounts */
- if (!(mnt_flags & MNT_LOCKED))
+ if (!(child->mnt.mnt_flags & MNT_LOCKED))
continue;
/* Is the directory permanetly empty? */
if (!is_empty_dir_inode(inode))