xps: Transmit Packet Steering

[net-next-2.6.git] / fs / namespace.c
diff --git a/fs/namespace.c b/fs/namespace.c

index 2e10cb19c5b02983e159bfe5f8039f3d08f3035d..8a415c9c5e552efd945cdb85af6f74af4b2b6de6 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -11,6 +11,8 @@
  #include <linux/syscalls.h>
  #include <linux/slab.h>
  #include <linux/sched.h>
+#include <linux/spinlock.h>
+#include <linux/percpu.h>
  #include <linux/smp_lock.h>
  #include <linux/init.h>
  #include <linux/kernel.h>
@@ -38,12 +40,10 @@
  #define HASH_SHIFT ilog2(PAGE_SIZE / sizeof(struct list_head))
  #define HASH_SIZE (1UL << HASH_SHIFT)
  
-/* spinlock for vfsmount related operations, inplace of dcache_lock */
-__cacheline_aligned_in_smp DEFINE_SPINLOCK(vfsmount_lock);
-
  static int event;
  static DEFINE_IDA(mnt_id_ida);
  static DEFINE_IDA(mnt_group_ida);
+static DEFINE_SPINLOCK(mnt_id_lock);
  static int mnt_id_start = 0;
  static int mnt_group_start = 1;
  
@@ -55,6 +55,16 @@ static struct rw_semaphore namespace_sem;
  struct kobject *fs_kobj;
  EXPORT_SYMBOL_GPL(fs_kobj);
  
+/*
+ * vfsmount lock may be taken for read to prevent changes to the
+ * vfsmount hash, i.e. during mountpoint lookups or walking back
+ * up the tree.
+ *
+ * It should be taken for write in all cases where the vfsmount
+ * tree or hash is modified or when a vfsmount structure is modified.
+ */
+DEFINE_BRLOCK(vfsmount_lock);
+
  static inline unsigned long hash(struct vfsmount *mnt, struct dentry *dentry)
  {
         unsigned long tmp = ((unsigned long)mnt / L1_CACHE_BYTES);
@@ -65,18 +75,21 @@ static inline unsigned long hash(struct vfsmount *mnt, struct dentry *dentry)
  
  #define MNT_WRITER_UNDERFLOW_LIMIT -(1<<16)
  
-/* allocation is serialized by namespace_sem */
+/*
+ * allocation is serialized by namespace_sem, but we need mnt_id_lock to
+ * serialize with freeing.
+ */
  static int mnt_alloc_id(struct vfsmount *mnt)
  {
         int res;
  
  retry:
         ida_pre_get(&mnt_id_ida, GFP_KERNEL);
-       spin_lock(&vfsmount_lock);
+       spin_lock(&mnt_id_lock);
         res = ida_get_new_above(&mnt_id_ida, mnt_id_start, &mnt->mnt_id);
         if (!res)
                 mnt_id_start = mnt->mnt_id + 1;
-       spin_unlock(&vfsmount_lock);
+       spin_unlock(&mnt_id_lock);
         if (res == -EAGAIN)
                 goto retry;
  
@@ -86,11 +99,11 @@ retry:
  static void mnt_free_id(struct vfsmount *mnt)
  {
         int id = mnt->mnt_id;
-       spin_lock(&vfsmount_lock);
+       spin_lock(&mnt_id_lock);
         ida_remove(&mnt_id_ida, id);
         if (mnt_id_start > id)
                 mnt_id_start = id;
-       spin_unlock(&vfsmount_lock);
+       spin_unlock(&mnt_id_lock);
  }
  
  /*
@@ -348,7 +361,7 @@ static int mnt_make_readonly(struct vfsmount *mnt)
  {
         int ret = 0;
  
-       spin_lock(&vfsmount_lock);
+       br_write_lock(vfsmount_lock);
         mnt->mnt_flags |= MNT_WRITE_HOLD;
         /*
          * After storing MNT_WRITE_HOLD, we'll read the counters. This store
@@ -382,15 +395,15 @@ static int mnt_make_readonly(struct vfsmount *mnt)
          */
         smp_wmb();
         mnt->mnt_flags &= ~MNT_WRITE_HOLD;
-       spin_unlock(&vfsmount_lock);
+       br_write_unlock(vfsmount_lock);
         return ret;
  }
  
  static void __mnt_unmake_readonly(struct vfsmount *mnt)
  {
-       spin_lock(&vfsmount_lock);
+       br_write_lock(vfsmount_lock);
         mnt->mnt_flags &= ~MNT_READONLY;
-       spin_unlock(&vfsmount_lock);
+       br_write_unlock(vfsmount_lock);
  }
  
  void simple_set_mnt(struct vfsmount *mnt, struct super_block *sb)
@@ -414,6 +427,7 @@ void free_vfsmnt(struct vfsmount *mnt)
  /*
   * find the first or last mount at @dentry on vfsmount @mnt depending on
   * @dir. If @dir is set return the first mount else return the last mount.
+ * vfsmount_lock must be held for read or write.
   */
  struct vfsmount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry,
                               int dir)
@@ -443,10 +457,11 @@ struct vfsmount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry,
  struct vfsmount *lookup_mnt(struct path *path)
  {
         struct vfsmount *child_mnt;
-       spin_lock(&vfsmount_lock);
+
+       br_read_lock(vfsmount_lock);
         if ((child_mnt = __lookup_mnt(path->mnt, path->dentry, 1)))
                 mntget(child_mnt);
-       spin_unlock(&vfsmount_lock);
+       br_read_unlock(vfsmount_lock);
         return child_mnt;
  }
  
@@ -455,6 +470,9 @@ static inline int check_mnt(struct vfsmount *mnt)
         return mnt->mnt_ns == current->nsproxy->mnt_ns;
  }
  
+/*
+ * vfsmount lock must be held for write
+ */
  static void touch_mnt_namespace(struct mnt_namespace *ns)
  {
         if (ns) {
@@ -463,6 +481,9 @@ static void touch_mnt_namespace(struct mnt_namespace *ns)
         }
  }
  
+/*
+ * vfsmount lock must be held for write
+ */
  static void __touch_mnt_namespace(struct mnt_namespace *ns)
  {
         if (ns && ns->event != event) {
@@ -471,6 +492,9 @@ static void __touch_mnt_namespace(struct mnt_namespace *ns)
         }
  }
  
+/*
+ * vfsmount lock must be held for write
+ */
  static void detach_mnt(struct vfsmount *mnt, struct path *old_path)
  {
         old_path->dentry = mnt->mnt_mountpoint;
@@ -482,6 +506,9 @@ static void detach_mnt(struct vfsmount *mnt, struct path *old_path)
         old_path->dentry->d_mounted--;
  }
  
+/*
+ * vfsmount lock must be held for write
+ */
  void mnt_set_mountpoint(struct vfsmount *mnt, struct dentry *dentry,
                         struct vfsmount *child_mnt)
  {
@@ -490,6 +517,9 @@ void mnt_set_mountpoint(struct vfsmount *mnt, struct dentry *dentry,
         dentry->d_mounted++;
  }
  
+/*
+ * vfsmount lock must be held for write
+ */
  static void attach_mnt(struct vfsmount *mnt, struct path *path)
  {
         mnt_set_mountpoint(path->mnt, path->dentry, mnt);
@@ -499,7 +529,7 @@ static void attach_mnt(struct vfsmount *mnt, struct path *path)
  }
  
  /*
- * the caller must hold vfsmount_lock
+ * vfsmount lock must be held for write
   */
  static void commit_tree(struct vfsmount *mnt)
  {
@@ -565,7 +595,7 @@ static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root,
                                 goto out_free;
                 }
  
-               mnt->mnt_flags = old->mnt_flags;
+               mnt->mnt_flags = old->mnt_flags & ~MNT_WRITE_HOLD;
                 atomic_inc(&sb->s_active);
                 mnt->mnt_sb = sb;
                 mnt->mnt_root = dget(root);
@@ -623,39 +653,43 @@ static inline void __mntput(struct vfsmount *mnt)
  void mntput_no_expire(struct vfsmount *mnt)
  {
  repeat:
-       if (atomic_dec_and_lock(&mnt->mnt_count, &vfsmount_lock)) {
-               if (likely(!mnt->mnt_pinned)) {
-                       spin_unlock(&vfsmount_lock);
-                       __mntput(mnt);
-                       return;
-               }
-               atomic_add(mnt->mnt_pinned + 1, &mnt->mnt_count);
-               mnt->mnt_pinned = 0;
-               spin_unlock(&vfsmount_lock);
-               acct_auto_close_mnt(mnt);
-               goto repeat;
+       if (atomic_add_unless(&mnt->mnt_count, -1, 1))
+               return;
+       br_write_lock(vfsmount_lock);
+       if (!atomic_dec_and_test(&mnt->mnt_count)) {
+               br_write_unlock(vfsmount_lock);
+               return;
         }
+       if (likely(!mnt->mnt_pinned)) {
+               br_write_unlock(vfsmount_lock);
+               __mntput(mnt);
+               return;
+       }
+       atomic_add(mnt->mnt_pinned + 1, &mnt->mnt_count);
+       mnt->mnt_pinned = 0;
+       br_write_unlock(vfsmount_lock);
+       acct_auto_close_mnt(mnt);
+       goto repeat;
  }
-
  EXPORT_SYMBOL(mntput_no_expire);
  
  void mnt_pin(struct vfsmount *mnt)
  {
-       spin_lock(&vfsmount_lock);
+       br_write_lock(vfsmount_lock);
         mnt->mnt_pinned++;
-       spin_unlock(&vfsmount_lock);
+       br_write_unlock(vfsmount_lock);
  }
  
  EXPORT_SYMBOL(mnt_pin);
  
  void mnt_unpin(struct vfsmount *mnt)
  {
-       spin_lock(&vfsmount_lock);
+       br_write_lock(vfsmount_lock);
         if (mnt->mnt_pinned) {
                 atomic_inc(&mnt->mnt_count);
                 mnt->mnt_pinned--;
         }
-       spin_unlock(&vfsmount_lock);
+       br_write_unlock(vfsmount_lock);
  }
  
  EXPORT_SYMBOL(mnt_unpin);
@@ -746,12 +780,12 @@ int mnt_had_events(struct proc_mounts *p)
         struct mnt_namespace *ns = p->ns;
         int res = 0;
  
-       spin_lock(&vfsmount_lock);
+       br_read_lock(vfsmount_lock);
         if (p->event != ns->event) {
                 p->event = ns->event;
                 res = 1;
         }
-       spin_unlock(&vfsmount_lock);
+       br_read_unlock(vfsmount_lock);
  
         return res;
  }
@@ -952,12 +986,12 @@ int may_umount_tree(struct vfsmount *mnt)
         int minimum_refs = 0;
         struct vfsmount *p;
  
-       spin_lock(&vfsmount_lock);
+       br_read_lock(vfsmount_lock);
         for (p = mnt; p; p = next_mnt(p, mnt)) {
                 actual_refs += atomic_read(&p->mnt_count);
                 minimum_refs += 2;
         }
-       spin_unlock(&vfsmount_lock);
+       br_read_unlock(vfsmount_lock);
  
         if (actual_refs > minimum_refs)
                 return 0;
@@ -984,10 +1018,10 @@ int may_umount(struct vfsmount *mnt)
  {
         int ret = 1;
         down_read(&namespace_sem);
-       spin_lock(&vfsmount_lock);
+       br_read_lock(vfsmount_lock);
         if (propagate_mount_busy(mnt, 2))
                 ret = 0;
-       spin_unlock(&vfsmount_lock);
+       br_read_unlock(vfsmount_lock);
         up_read(&namespace_sem);
         return ret;
  }
@@ -1003,13 +1037,14 @@ void release_mounts(struct list_head *head)
                 if (mnt->mnt_parent != mnt) {
                         struct dentry *dentry;
                         struct vfsmount *m;
-                       spin_lock(&vfsmount_lock);
+
+                       br_write_lock(vfsmount_lock);
                         dentry = mnt->mnt_mountpoint;
                         m = mnt->mnt_parent;
                         mnt->mnt_mountpoint = mnt->mnt_root;
                         mnt->mnt_parent = mnt;
                         m->mnt_ghosts--;
-                       spin_unlock(&vfsmount_lock);
+                       br_write_unlock(vfsmount_lock);
                         dput(dentry);
                         mntput(m);
                 }
@@ -1017,6 +1052,10 @@ void release_mounts(struct list_head *head)
         }
  }
  
+/*
+ * vfsmount lock must be held for write
+ * namespace_sem must be held for write
+ */
  void umount_tree(struct vfsmount *mnt, int propagate, struct list_head *kill)
  {
         struct vfsmount *p;
@@ -1107,7 +1146,7 @@ static int do_umount(struct vfsmount *mnt, int flags)
         }
  
         down_write(&namespace_sem);
-       spin_lock(&vfsmount_lock);
+       br_write_lock(vfsmount_lock);
         event++;
  
         if (!(flags & MNT_DETACH))
@@ -1119,7 +1158,7 @@ static int do_umount(struct vfsmount *mnt, int flags)
                         umount_tree(mnt, 1, &umount_list);
                 retval = 0;
         }
-       spin_unlock(&vfsmount_lock);
+       br_write_unlock(vfsmount_lock);
         up_write(&namespace_sem);
         release_mounts(&umount_list);
         return retval;
@@ -1231,19 +1270,19 @@ struct vfsmount *copy_tree(struct vfsmount *mnt, struct dentry *dentry,
                         q = clone_mnt(p, p->mnt_root, flag);
                         if (!q)
                                 goto Enomem;
-                       spin_lock(&vfsmount_lock);
+                       br_write_lock(vfsmount_lock);
                         list_add_tail(&q->mnt_list, &res->mnt_list);
                         attach_mnt(q, &path);
-                       spin_unlock(&vfsmount_lock);
+                       br_write_unlock(vfsmount_lock);
                 }
         }
         return res;
  Enomem:
         if (res) {
                 LIST_HEAD(umount_list);
-               spin_lock(&vfsmount_lock);
+               br_write_lock(vfsmount_lock);
                 umount_tree(res, 0, &umount_list);
-               spin_unlock(&vfsmount_lock);
+               br_write_unlock(vfsmount_lock);
                 release_mounts(&umount_list);
         }
         return NULL;
@@ -1262,9 +1301,9 @@ void drop_collected_mounts(struct vfsmount *mnt)
  {
         LIST_HEAD(umount_list);
         down_write(&namespace_sem);
-       spin_lock(&vfsmount_lock);
+       br_write_lock(vfsmount_lock);
         umount_tree(mnt, 0, &umount_list);
-       spin_unlock(&vfsmount_lock);
+       br_write_unlock(vfsmount_lock);
         up_write(&namespace_sem);
         release_mounts(&umount_list);
  }
@@ -1392,7 +1431,7 @@ static int attach_recursive_mnt(struct vfsmount *source_mnt,
         if (err)
                 goto out_cleanup_ids;
  
-       spin_lock(&vfsmount_lock);
+       br_write_lock(vfsmount_lock);
  
         if (IS_MNT_SHARED(dest_mnt)) {
                 for (p = source_mnt; p; p = next_mnt(p, source_mnt))
@@ -1411,7 +1450,8 @@ static int attach_recursive_mnt(struct vfsmount *source_mnt,
                 list_del_init(&child->mnt_hash);
                 commit_tree(child);
         }
-       spin_unlock(&vfsmount_lock);
+       br_write_unlock(vfsmount_lock);
+
         return 0;
  
   out_cleanup_ids:
@@ -1443,6 +1483,23 @@ out_unlock:
         return err;
  }
  
+/*
+ * Sanity check the flags to change_mnt_propagation.
+ */
+
+static int flags_to_propagation_type(int flags)
+{
+       int type = flags & ~MS_REC;
+
+       /* Fail if any non-propagation flags are set */
+       if (type & ~(MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE))
+               return 0;
+       /* Only one propagation flag should be set */
+       if (!is_power_of_2(type))
+               return 0;
+       return type;
+}
+
  /*
   * recursively change the type of the mountpoint.
   */
@@ -1450,7 +1507,7 @@ static int do_change_type(struct path *path, int flag)
  {
         struct vfsmount *m, *mnt = path->mnt;
         int recurse = flag & MS_REC;
-       int type = flag & ~MS_REC;
+       int type;
         int err = 0;
  
         if (!capable(CAP_SYS_ADMIN))
@@ -1459,6 +1516,10 @@ static int do_change_type(struct path *path, int flag)
         if (path->dentry != path->mnt->mnt_root)
                 return -EINVAL;
  
+       type = flags_to_propagation_type(flag);
+       if (!type)
+               return -EINVAL;
+
         down_write(&namespace_sem);
         if (type == MS_SHARED) {
                 err = invent_group_ids(mnt, recurse);
@@ -1466,10 +1527,10 @@ static int do_change_type(struct path *path, int flag)
                         goto out_unlock;
         }
  
-       spin_lock(&vfsmount_lock);
+       br_write_lock(vfsmount_lock);
         for (m = mnt; m; m = (recurse ? next_mnt(m, mnt) : NULL))
                 change_mnt_propagation(m, type);
-       spin_unlock(&vfsmount_lock);
+       br_write_unlock(vfsmount_lock);
  
   out_unlock:
         up_write(&namespace_sem);
@@ -1513,9 +1574,10 @@ static int do_loopback(struct path *path, char *old_name,
         err = graft_tree(mnt, path);
         if (err) {
                 LIST_HEAD(umount_list);
-               spin_lock(&vfsmount_lock);
+
+               br_write_lock(vfsmount_lock);
                 umount_tree(mnt, 0, &umount_list);
-               spin_unlock(&vfsmount_lock);
+               br_write_unlock(vfsmount_lock);
                 release_mounts(&umount_list);
         }
  
@@ -1568,16 +1630,16 @@ static int do_remount(struct path *path, int flags, int mnt_flags,
         else
                 err = do_remount_sb(sb, flags, data, 0);
         if (!err) {
-               spin_lock(&vfsmount_lock);
+               br_write_lock(vfsmount_lock);
                 mnt_flags |= path->mnt->mnt_flags & MNT_PROPAGATION_MASK;
                 path->mnt->mnt_flags = mnt_flags;
-               spin_unlock(&vfsmount_lock);
+               br_write_unlock(vfsmount_lock);
         }
         up_write(&sb->s_umount);
         if (!err) {
-               spin_lock(&vfsmount_lock);
+               br_write_lock(vfsmount_lock);
                 touch_mnt_namespace(path->mnt->mnt_ns);
-               spin_unlock(&vfsmount_lock);
+               br_write_unlock(vfsmount_lock);
         }
         return err;
  }
@@ -1682,9 +1744,7 @@ static int do_new_mount(struct path *path, char *type, int flags,
         if (!capable(CAP_SYS_ADMIN))
                 return -EPERM;
  
-       lock_kernel();
         mnt = do_kern_mount(type, flags, name, data);
-       unlock_kernel();
         if (IS_ERR(mnt))
                 return PTR_ERR(mnt);
  
@@ -1754,7 +1814,7 @@ void mark_mounts_for_expiry(struct list_head *mounts)
                 return;
  
         down_write(&namespace_sem);
-       spin_lock(&vfsmount_lock);
+       br_write_lock(vfsmount_lock);
  
         /* extract from the expiration list every vfsmount that matches the
          * following criteria:
@@ -1773,7 +1833,7 @@ void mark_mounts_for_expiry(struct list_head *mounts)
                 touch_mnt_namespace(mnt->mnt_ns);
                 umount_tree(mnt, 1, &umounts);
         }
-       spin_unlock(&vfsmount_lock);
+       br_write_unlock(vfsmount_lock);
         up_write(&namespace_sem);
  
         release_mounts(&umounts);
@@ -1830,6 +1890,8 @@ resume:
  /*
   * process a list of expirable mountpoints with the intent of discarding any
   * submounts of a specific parent mountpoint
+ *
+ * vfsmount_lock must be held for write
   */
  static void shrink_submounts(struct vfsmount *mnt, struct list_head *umounts)
  {
@@ -2048,9 +2110,9 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns,
                 kfree(new_ns);
                 return ERR_PTR(-ENOMEM);
         }
-       spin_lock(&vfsmount_lock);
+       br_write_lock(vfsmount_lock);
         list_add_tail(&new_ns->list, &new_ns->root->mnt_list);
-       spin_unlock(&vfsmount_lock);
+       br_write_unlock(vfsmount_lock);
  
         /*
          * Second pass: switch the tsk->fs->* elements and mark new vfsmounts
@@ -2244,7 +2306,7 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
                 goto out2; /* not attached */
         /* make sure we can reach put_old from new_root */
         tmp = old.mnt;
-       spin_lock(&vfsmount_lock);
+       br_write_lock(vfsmount_lock);
         if (tmp != new.mnt) {
                 for (;;) {
                         if (tmp->mnt_parent == tmp)
@@ -2264,7 +2326,7 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
         /* mount new_root on / */
         attach_mnt(new.mnt, &root_parent);
         touch_mnt_namespace(current->nsproxy->mnt_ns);
-       spin_unlock(&vfsmount_lock);
+       br_write_unlock(vfsmount_lock);
         chroot_fs_refs(&root, &new);
         error = 0;
         path_put(&root_parent);
@@ -2279,7 +2341,7 @@ out1:
  out0:
         return error;
  out3:
-       spin_unlock(&vfsmount_lock);
+       br_write_unlock(vfsmount_lock);
         goto out2;
  }
  
@@ -2326,6 +2388,8 @@ void __init mnt_init(void)
         for (u = 0; u < HASH_SIZE; u++)
                 INIT_LIST_HEAD(&mount_hashtable[u]);
  
+       br_lock_init(vfsmount_lock);
+
         err = sysfs_init();
         if (err)
                 printk(KERN_WARNING "%s: sysfs_init error: %d\n",
@@ -2344,9 +2408,9 @@ void put_mnt_ns(struct mnt_namespace *ns)
         if (!atomic_dec_and_test(&ns->count))
                 return;
         down_write(&namespace_sem);
-       spin_lock(&vfsmount_lock);
+       br_write_lock(vfsmount_lock);
         umount_tree(ns->root, 0, &umount_list);
-       spin_unlock(&vfsmount_lock);
+       br_write_unlock(vfsmount_lock);
         up_write(&namespace_sem);
         release_mounts(&umount_list);
         kfree(ns);