]> bbs.cooldavid.org Git - net-next-2.6.git/blobdiff - fs/namespace.c
xps: Transmit Packet Steering
[net-next-2.6.git] / fs / namespace.c
index 2e10cb19c5b02983e159bfe5f8039f3d08f3035d..8a415c9c5e552efd945cdb85af6f74af4b2b6de6 100644 (file)
@@ -11,6 +11,8 @@
 #include <linux/syscalls.h>
 #include <linux/slab.h>
 #include <linux/sched.h>
+#include <linux/spinlock.h>
+#include <linux/percpu.h>
 #include <linux/smp_lock.h>
 #include <linux/init.h>
 #include <linux/kernel.h>
 #define HASH_SHIFT ilog2(PAGE_SIZE / sizeof(struct list_head))
 #define HASH_SIZE (1UL << HASH_SHIFT)
 
-/* spinlock for vfsmount related operations, inplace of dcache_lock */
-__cacheline_aligned_in_smp DEFINE_SPINLOCK(vfsmount_lock);
-
 static int event;
 static DEFINE_IDA(mnt_id_ida);
 static DEFINE_IDA(mnt_group_ida);
+static DEFINE_SPINLOCK(mnt_id_lock);
 static int mnt_id_start = 0;
 static int mnt_group_start = 1;
 
@@ -55,6 +55,16 @@ static struct rw_semaphore namespace_sem;
 struct kobject *fs_kobj;
 EXPORT_SYMBOL_GPL(fs_kobj);
 
+/*
+ * vfsmount lock may be taken for read to prevent changes to the
+ * vfsmount hash, ie. during mountpoint lookups or walking back
+ * up the tree.
+ *
+ * It should be taken for write in all cases where the vfsmount
+ * tree or hash is modified or when a vfsmount structure is modified.
+ */
+DEFINE_BRLOCK(vfsmount_lock);
+
 static inline unsigned long hash(struct vfsmount *mnt, struct dentry *dentry)
 {
        unsigned long tmp = ((unsigned long)mnt / L1_CACHE_BYTES);
@@ -65,18 +75,21 @@ static inline unsigned long hash(struct vfsmount *mnt, struct dentry *dentry)
 
 #define MNT_WRITER_UNDERFLOW_LIMIT -(1<<16)
 
-/* allocation is serialized by namespace_sem */
+/*
+ * allocation is serialized by namespace_sem, but we need the spinlock to
+ * serialize with freeing.
+ */
 static int mnt_alloc_id(struct vfsmount *mnt)
 {
        int res;
 
 retry:
        ida_pre_get(&mnt_id_ida, GFP_KERNEL);
-       spin_lock(&vfsmount_lock);
+       spin_lock(&mnt_id_lock);
        res = ida_get_new_above(&mnt_id_ida, mnt_id_start, &mnt->mnt_id);
        if (!res)
                mnt_id_start = mnt->mnt_id + 1;
-       spin_unlock(&vfsmount_lock);
+       spin_unlock(&mnt_id_lock);
        if (res == -EAGAIN)
                goto retry;
 
@@ -86,11 +99,11 @@ retry:
 static void mnt_free_id(struct vfsmount *mnt)
 {
        int id = mnt->mnt_id;
-       spin_lock(&vfsmount_lock);
+       spin_lock(&mnt_id_lock);
        ida_remove(&mnt_id_ida, id);
        if (mnt_id_start > id)
                mnt_id_start = id;
-       spin_unlock(&vfsmount_lock);
+       spin_unlock(&mnt_id_lock);
 }
 
 /*
@@ -348,7 +361,7 @@ static int mnt_make_readonly(struct vfsmount *mnt)
 {
        int ret = 0;
 
-       spin_lock(&vfsmount_lock);
+       br_write_lock(vfsmount_lock);
        mnt->mnt_flags |= MNT_WRITE_HOLD;
        /*
         * After storing MNT_WRITE_HOLD, we'll read the counters. This store
@@ -382,15 +395,15 @@ static int mnt_make_readonly(struct vfsmount *mnt)
         */
        smp_wmb();
        mnt->mnt_flags &= ~MNT_WRITE_HOLD;
-       spin_unlock(&vfsmount_lock);
+       br_write_unlock(vfsmount_lock);
        return ret;
 }
 
 static void __mnt_unmake_readonly(struct vfsmount *mnt)
 {
-       spin_lock(&vfsmount_lock);
+       br_write_lock(vfsmount_lock);
        mnt->mnt_flags &= ~MNT_READONLY;
-       spin_unlock(&vfsmount_lock);
+       br_write_unlock(vfsmount_lock);
 }
 
 void simple_set_mnt(struct vfsmount *mnt, struct super_block *sb)
@@ -414,6 +427,7 @@ void free_vfsmnt(struct vfsmount *mnt)
 /*
  * find the first or last mount at @dentry on vfsmount @mnt depending on
  * @dir. If @dir is set return the first mount else return the last mount.
+ * vfsmount_lock must be held for read or write.
  */
 struct vfsmount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry,
                              int dir)
@@ -443,10 +457,11 @@ struct vfsmount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry,
 struct vfsmount *lookup_mnt(struct path *path)
 {
        struct vfsmount *child_mnt;
-       spin_lock(&vfsmount_lock);
+
+       br_read_lock(vfsmount_lock);
        if ((child_mnt = __lookup_mnt(path->mnt, path->dentry, 1)))
                mntget(child_mnt);
-       spin_unlock(&vfsmount_lock);
+       br_read_unlock(vfsmount_lock);
        return child_mnt;
 }
 
@@ -455,6 +470,9 @@ static inline int check_mnt(struct vfsmount *mnt)
        return mnt->mnt_ns == current->nsproxy->mnt_ns;
 }
 
+/*
+ * vfsmount lock must be held for write
+ */
 static void touch_mnt_namespace(struct mnt_namespace *ns)
 {
        if (ns) {
@@ -463,6 +481,9 @@ static void touch_mnt_namespace(struct mnt_namespace *ns)
        }
 }
 
+/*
+ * vfsmount lock must be held for write
+ */
 static void __touch_mnt_namespace(struct mnt_namespace *ns)
 {
        if (ns && ns->event != event) {
@@ -471,6 +492,9 @@ static void __touch_mnt_namespace(struct mnt_namespace *ns)
        }
 }
 
+/*
+ * vfsmount lock must be held for write
+ */
 static void detach_mnt(struct vfsmount *mnt, struct path *old_path)
 {
        old_path->dentry = mnt->mnt_mountpoint;
@@ -482,6 +506,9 @@ static void detach_mnt(struct vfsmount *mnt, struct path *old_path)
        old_path->dentry->d_mounted--;
 }
 
+/*
+ * vfsmount lock must be held for write
+ */
 void mnt_set_mountpoint(struct vfsmount *mnt, struct dentry *dentry,
                        struct vfsmount *child_mnt)
 {
@@ -490,6 +517,9 @@ void mnt_set_mountpoint(struct vfsmount *mnt, struct dentry *dentry,
        dentry->d_mounted++;
 }
 
+/*
+ * vfsmount lock must be held for write
+ */
 static void attach_mnt(struct vfsmount *mnt, struct path *path)
 {
        mnt_set_mountpoint(path->mnt, path->dentry, mnt);
@@ -499,7 +529,7 @@ static void attach_mnt(struct vfsmount *mnt, struct path *path)
 }
 
 /*
- * the caller must hold vfsmount_lock
+ * vfsmount lock must be held for write
  */
 static void commit_tree(struct vfsmount *mnt)
 {
@@ -565,7 +595,7 @@ static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root,
                                goto out_free;
                }
 
-               mnt->mnt_flags = old->mnt_flags;
+               mnt->mnt_flags = old->mnt_flags & ~MNT_WRITE_HOLD;
                atomic_inc(&sb->s_active);
                mnt->mnt_sb = sb;
                mnt->mnt_root = dget(root);
@@ -623,39 +653,43 @@ static inline void __mntput(struct vfsmount *mnt)
 void mntput_no_expire(struct vfsmount *mnt)
 {
 repeat:
-       if (atomic_dec_and_lock(&mnt->mnt_count, &vfsmount_lock)) {
-               if (likely(!mnt->mnt_pinned)) {
-                       spin_unlock(&vfsmount_lock);
-                       __mntput(mnt);
-                       return;
-               }
-               atomic_add(mnt->mnt_pinned + 1, &mnt->mnt_count);
-               mnt->mnt_pinned = 0;
-               spin_unlock(&vfsmount_lock);
-               acct_auto_close_mnt(mnt);
-               goto repeat;
+       if (atomic_add_unless(&mnt->mnt_count, -1, 1))
+               return;
+       br_write_lock(vfsmount_lock);
+       if (!atomic_dec_and_test(&mnt->mnt_count)) {
+               br_write_unlock(vfsmount_lock);
+               return;
        }
+       if (likely(!mnt->mnt_pinned)) {
+               br_write_unlock(vfsmount_lock);
+               __mntput(mnt);
+               return;
+       }
+       atomic_add(mnt->mnt_pinned + 1, &mnt->mnt_count);
+       mnt->mnt_pinned = 0;
+       br_write_unlock(vfsmount_lock);
+       acct_auto_close_mnt(mnt);
+       goto repeat;
 }
-
 EXPORT_SYMBOL(mntput_no_expire);
 
 void mnt_pin(struct vfsmount *mnt)
 {
-       spin_lock(&vfsmount_lock);
+       br_write_lock(vfsmount_lock);
        mnt->mnt_pinned++;
-       spin_unlock(&vfsmount_lock);
+       br_write_unlock(vfsmount_lock);
 }
 
 EXPORT_SYMBOL(mnt_pin);
 
 void mnt_unpin(struct vfsmount *mnt)
 {
-       spin_lock(&vfsmount_lock);
+       br_write_lock(vfsmount_lock);
        if (mnt->mnt_pinned) {
                atomic_inc(&mnt->mnt_count);
                mnt->mnt_pinned--;
        }
-       spin_unlock(&vfsmount_lock);
+       br_write_unlock(vfsmount_lock);
 }
 
 EXPORT_SYMBOL(mnt_unpin);
@@ -746,12 +780,12 @@ int mnt_had_events(struct proc_mounts *p)
        struct mnt_namespace *ns = p->ns;
        int res = 0;
 
-       spin_lock(&vfsmount_lock);
+       br_read_lock(vfsmount_lock);
        if (p->event != ns->event) {
                p->event = ns->event;
                res = 1;
        }
-       spin_unlock(&vfsmount_lock);
+       br_read_unlock(vfsmount_lock);
 
        return res;
 }
@@ -952,12 +986,12 @@ int may_umount_tree(struct vfsmount *mnt)
        int minimum_refs = 0;
        struct vfsmount *p;
 
-       spin_lock(&vfsmount_lock);
+       br_read_lock(vfsmount_lock);
        for (p = mnt; p; p = next_mnt(p, mnt)) {
                actual_refs += atomic_read(&p->mnt_count);
                minimum_refs += 2;
        }
-       spin_unlock(&vfsmount_lock);
+       br_read_unlock(vfsmount_lock);
 
        if (actual_refs > minimum_refs)
                return 0;
@@ -984,10 +1018,10 @@ int may_umount(struct vfsmount *mnt)
 {
        int ret = 1;
        down_read(&namespace_sem);
-       spin_lock(&vfsmount_lock);
+       br_read_lock(vfsmount_lock);
        if (propagate_mount_busy(mnt, 2))
                ret = 0;
-       spin_unlock(&vfsmount_lock);
+       br_read_unlock(vfsmount_lock);
        up_read(&namespace_sem);
        return ret;
 }
@@ -1003,13 +1037,14 @@ void release_mounts(struct list_head *head)
                if (mnt->mnt_parent != mnt) {
                        struct dentry *dentry;
                        struct vfsmount *m;
-                       spin_lock(&vfsmount_lock);
+
+                       br_write_lock(vfsmount_lock);
                        dentry = mnt->mnt_mountpoint;
                        m = mnt->mnt_parent;
                        mnt->mnt_mountpoint = mnt->mnt_root;
                        mnt->mnt_parent = mnt;
                        m->mnt_ghosts--;
-                       spin_unlock(&vfsmount_lock);
+                       br_write_unlock(vfsmount_lock);
                        dput(dentry);
                        mntput(m);
                }
@@ -1017,6 +1052,10 @@ void release_mounts(struct list_head *head)
        }
 }
 
+/*
+ * vfsmount lock must be held for write
+ * namespace_sem must be held for write
+ */
 void umount_tree(struct vfsmount *mnt, int propagate, struct list_head *kill)
 {
        struct vfsmount *p;
@@ -1107,7 +1146,7 @@ static int do_umount(struct vfsmount *mnt, int flags)
        }
 
        down_write(&namespace_sem);
-       spin_lock(&vfsmount_lock);
+       br_write_lock(vfsmount_lock);
        event++;
 
        if (!(flags & MNT_DETACH))
@@ -1119,7 +1158,7 @@ static int do_umount(struct vfsmount *mnt, int flags)
                        umount_tree(mnt, 1, &umount_list);
                retval = 0;
        }
-       spin_unlock(&vfsmount_lock);
+       br_write_unlock(vfsmount_lock);
        up_write(&namespace_sem);
        release_mounts(&umount_list);
        return retval;
@@ -1231,19 +1270,19 @@ struct vfsmount *copy_tree(struct vfsmount *mnt, struct dentry *dentry,
                        q = clone_mnt(p, p->mnt_root, flag);
                        if (!q)
                                goto Enomem;
-                       spin_lock(&vfsmount_lock);
+                       br_write_lock(vfsmount_lock);
                        list_add_tail(&q->mnt_list, &res->mnt_list);
                        attach_mnt(q, &path);
-                       spin_unlock(&vfsmount_lock);
+                       br_write_unlock(vfsmount_lock);
                }
        }
        return res;
 Enomem:
        if (res) {
                LIST_HEAD(umount_list);
-               spin_lock(&vfsmount_lock);
+               br_write_lock(vfsmount_lock);
                umount_tree(res, 0, &umount_list);
-               spin_unlock(&vfsmount_lock);
+               br_write_unlock(vfsmount_lock);
                release_mounts(&umount_list);
        }
        return NULL;
@@ -1262,9 +1301,9 @@ void drop_collected_mounts(struct vfsmount *mnt)
 {
        LIST_HEAD(umount_list);
        down_write(&namespace_sem);
-       spin_lock(&vfsmount_lock);
+       br_write_lock(vfsmount_lock);
        umount_tree(mnt, 0, &umount_list);
-       spin_unlock(&vfsmount_lock);
+       br_write_unlock(vfsmount_lock);
        up_write(&namespace_sem);
        release_mounts(&umount_list);
 }
@@ -1392,7 +1431,7 @@ static int attach_recursive_mnt(struct vfsmount *source_mnt,
        if (err)
                goto out_cleanup_ids;
 
-       spin_lock(&vfsmount_lock);
+       br_write_lock(vfsmount_lock);
 
        if (IS_MNT_SHARED(dest_mnt)) {
                for (p = source_mnt; p; p = next_mnt(p, source_mnt))
@@ -1411,7 +1450,8 @@ static int attach_recursive_mnt(struct vfsmount *source_mnt,
                list_del_init(&child->mnt_hash);
                commit_tree(child);
        }
-       spin_unlock(&vfsmount_lock);
+       br_write_unlock(vfsmount_lock);
+
        return 0;
 
  out_cleanup_ids:
@@ -1443,6 +1483,23 @@ out_unlock:
        return err;
 }
 
+/*
+ * Sanity check the flags to change_mnt_propagation.
+ */
+
+static int flags_to_propagation_type(int flags)
+{
+       int type = flags & ~MS_REC;
+
+       /* Fail if any non-propagation flags are set */
+       if (type & ~(MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE))
+               return 0;
+       /* Only one propagation flag should be set */
+       if (!is_power_of_2(type))
+               return 0;
+       return type;
+}
+
 /*
  * recursively change the type of the mountpoint.
  */
@@ -1450,7 +1507,7 @@ static int do_change_type(struct path *path, int flag)
 {
        struct vfsmount *m, *mnt = path->mnt;
        int recurse = flag & MS_REC;
-       int type = flag & ~MS_REC;
+       int type;
        int err = 0;
 
        if (!capable(CAP_SYS_ADMIN))
@@ -1459,6 +1516,10 @@ static int do_change_type(struct path *path, int flag)
        if (path->dentry != path->mnt->mnt_root)
                return -EINVAL;
 
+       type = flags_to_propagation_type(flag);
+       if (!type)
+               return -EINVAL;
+
        down_write(&namespace_sem);
        if (type == MS_SHARED) {
                err = invent_group_ids(mnt, recurse);
@@ -1466,10 +1527,10 @@ static int do_change_type(struct path *path, int flag)
                        goto out_unlock;
        }
 
-       spin_lock(&vfsmount_lock);
+       br_write_lock(vfsmount_lock);
        for (m = mnt; m; m = (recurse ? next_mnt(m, mnt) : NULL))
                change_mnt_propagation(m, type);
-       spin_unlock(&vfsmount_lock);
+       br_write_unlock(vfsmount_lock);
 
  out_unlock:
        up_write(&namespace_sem);
@@ -1513,9 +1574,10 @@ static int do_loopback(struct path *path, char *old_name,
        err = graft_tree(mnt, path);
        if (err) {
                LIST_HEAD(umount_list);
-               spin_lock(&vfsmount_lock);
+
+               br_write_lock(vfsmount_lock);
                umount_tree(mnt, 0, &umount_list);
-               spin_unlock(&vfsmount_lock);
+               br_write_unlock(vfsmount_lock);
                release_mounts(&umount_list);
        }
 
@@ -1568,16 +1630,16 @@ static int do_remount(struct path *path, int flags, int mnt_flags,
        else
                err = do_remount_sb(sb, flags, data, 0);
        if (!err) {
-               spin_lock(&vfsmount_lock);
+               br_write_lock(vfsmount_lock);
                mnt_flags |= path->mnt->mnt_flags & MNT_PROPAGATION_MASK;
                path->mnt->mnt_flags = mnt_flags;
-               spin_unlock(&vfsmount_lock);
+               br_write_unlock(vfsmount_lock);
        }
        up_write(&sb->s_umount);
        if (!err) {
-               spin_lock(&vfsmount_lock);
+               br_write_lock(vfsmount_lock);
                touch_mnt_namespace(path->mnt->mnt_ns);
-               spin_unlock(&vfsmount_lock);
+               br_write_unlock(vfsmount_lock);
        }
        return err;
 }
@@ -1682,9 +1744,7 @@ static int do_new_mount(struct path *path, char *type, int flags,
        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
 
-       lock_kernel();
        mnt = do_kern_mount(type, flags, name, data);
-       unlock_kernel();
        if (IS_ERR(mnt))
                return PTR_ERR(mnt);
 
@@ -1754,7 +1814,7 @@ void mark_mounts_for_expiry(struct list_head *mounts)
                return;
 
        down_write(&namespace_sem);
-       spin_lock(&vfsmount_lock);
+       br_write_lock(vfsmount_lock);
 
        /* extract from the expiration list every vfsmount that matches the
         * following criteria:
@@ -1773,7 +1833,7 @@ void mark_mounts_for_expiry(struct list_head *mounts)
                touch_mnt_namespace(mnt->mnt_ns);
                umount_tree(mnt, 1, &umounts);
        }
-       spin_unlock(&vfsmount_lock);
+       br_write_unlock(vfsmount_lock);
        up_write(&namespace_sem);
 
        release_mounts(&umounts);
@@ -1830,6 +1890,8 @@ resume:
 /*
  * process a list of expirable mountpoints with the intent of discarding any
  * submounts of a specific parent mountpoint
+ *
+ * vfsmount_lock must be held for write
  */
 static void shrink_submounts(struct vfsmount *mnt, struct list_head *umounts)
 {
@@ -2048,9 +2110,9 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns,
                kfree(new_ns);
                return ERR_PTR(-ENOMEM);
        }
-       spin_lock(&vfsmount_lock);
+       br_write_lock(vfsmount_lock);
        list_add_tail(&new_ns->list, &new_ns->root->mnt_list);
-       spin_unlock(&vfsmount_lock);
+       br_write_unlock(vfsmount_lock);
 
        /*
         * Second pass: switch the tsk->fs->* elements and mark new vfsmounts
@@ -2244,7 +2306,7 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
                goto out2; /* not attached */
        /* make sure we can reach put_old from new_root */
        tmp = old.mnt;
-       spin_lock(&vfsmount_lock);
+       br_write_lock(vfsmount_lock);
        if (tmp != new.mnt) {
                for (;;) {
                        if (tmp->mnt_parent == tmp)
@@ -2264,7 +2326,7 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
        /* mount new_root on / */
        attach_mnt(new.mnt, &root_parent);
        touch_mnt_namespace(current->nsproxy->mnt_ns);
-       spin_unlock(&vfsmount_lock);
+       br_write_unlock(vfsmount_lock);
        chroot_fs_refs(&root, &new);
        error = 0;
        path_put(&root_parent);
@@ -2279,7 +2341,7 @@ out1:
 out0:
        return error;
 out3:
-       spin_unlock(&vfsmount_lock);
+       br_write_unlock(vfsmount_lock);
        goto out2;
 }
 
@@ -2326,6 +2388,8 @@ void __init mnt_init(void)
        for (u = 0; u < HASH_SIZE; u++)
                INIT_LIST_HEAD(&mount_hashtable[u]);
 
+       br_lock_init(vfsmount_lock);
+
        err = sysfs_init();
        if (err)
                printk(KERN_WARNING "%s: sysfs_init error: %d\n",
@@ -2344,9 +2408,9 @@ void put_mnt_ns(struct mnt_namespace *ns)
        if (!atomic_dec_and_test(&ns->count))
                return;
        down_write(&namespace_sem);
-       spin_lock(&vfsmount_lock);
+       br_write_lock(vfsmount_lock);
        umount_tree(ns->root, 0, &umount_list);
-       spin_unlock(&vfsmount_lock);
+       br_write_unlock(vfsmount_lock);
        up_write(&namespace_sem);
        release_mounts(&umount_list);
        kfree(ns);