]> bbs.cooldavid.org Git - net-next-2.6.git/blobdiff - drivers/md/raid5.c
block: unify flags for struct bio and struct request
[net-next-2.6.git] / drivers / md / raid5.c
index d2c0f94fa37d7a708b1c6854e7be3b1d3b60162f..20ac2f14376a1417552982697cde2b303af7afd2 100644 (file)
@@ -277,12 +277,13 @@ out:
        return sh;
 }
 
-static void shrink_buffers(struct stripe_head *sh, int num)
+static void shrink_buffers(struct stripe_head *sh)
 {
        struct page *p;
        int i;
+       int num = sh->raid_conf->pool_size;
 
-       for (i=0; i<num ; i++) {
+       for (i = 0; i < num ; i++) {
                p = sh->dev[i].page;
                if (!p)
                        continue;
@@ -291,11 +292,12 @@ static void shrink_buffers(struct stripe_head *sh, int num)
        }
 }
 
-static int grow_buffers(struct stripe_head *sh, int num)
+static int grow_buffers(struct stripe_head *sh)
 {
        int i;
+       int num = sh->raid_conf->pool_size;
 
-       for (i=0; i<num; i++) {
+       for (i = 0; i < num; i++) {
                struct page *page;
 
                if (!(page = alloc_page(GFP_KERNEL))) {
@@ -364,6 +366,73 @@ static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector,
        return NULL;
 }
 
+/*
+ * Need to check if array has failed when deciding whether to:
+ *  - start an array
+ *  - remove non-faulty devices
+ *  - add a spare
+ *  - allow a reshape
+ * This determination is simple when no reshape is happening.
+ * However if there is a reshape, we need to carefully check
+ * both the before and after sections.
+ * This is because some failed devices may only affect one
+ * of the two sections, and some non-in_sync devices may
+ * be insync in the section most affected by failed devices.
+ */
+static int has_failed(raid5_conf_t *conf)
+{
+       int degraded;
+       int i;
+       if (conf->mddev->reshape_position == MaxSector)
+               return conf->mddev->degraded > conf->max_degraded;
+
+       rcu_read_lock();
+       degraded = 0;
+       for (i = 0; i < conf->previous_raid_disks; i++) {
+               mdk_rdev_t *rdev = rcu_dereference(conf->disks[i].rdev);
+               if (!rdev || test_bit(Faulty, &rdev->flags))
+                       degraded++;
+               else if (test_bit(In_sync, &rdev->flags))
+                       ;
+               else
+                       /* not in-sync or faulty.
+                        * If the reshape increases the number of devices,
+                        * this is being recovered by the reshape, so
+                        * this 'previous' section is not in_sync.
+                        * If the number of devices is being reduced however,
+                        * the device can only be part of the array if
+                        * we are reverting a reshape, so this section will
+                        * be in-sync.
+                        */
+                       if (conf->raid_disks >= conf->previous_raid_disks)
+                               degraded++;
+       }
+       rcu_read_unlock();
+       if (degraded > conf->max_degraded)
+               return 1;
+       rcu_read_lock();
+       degraded = 0;
+       for (i = 0; i < conf->raid_disks; i++) {
+               mdk_rdev_t *rdev = rcu_dereference(conf->disks[i].rdev);
+               if (!rdev || test_bit(Faulty, &rdev->flags))
+                       degraded++;
+               else if (test_bit(In_sync, &rdev->flags))
+                       ;
+               else
+                       /* not in-sync or faulty.
+                        * If reshape increases the number of devices, this
+                        * section has already been recovered, else it
+                        * almost certainly hasn't.
+                        */
+                       if (conf->raid_disks <= conf->previous_raid_disks)
+                               degraded++;
+       }
+       rcu_read_unlock();
+       if (degraded > conf->max_degraded)
+               return 1;
+       return 0;
+}
+
 static void unplug_slaves(mddev_t *mddev);
 static void raid5_unplug_device(struct request_queue *q);
 
@@ -1240,19 +1309,18 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
 static int grow_one_stripe(raid5_conf_t *conf)
 {
        struct stripe_head *sh;
-       int disks = max(conf->raid_disks, conf->previous_raid_disks);
        sh = kmem_cache_alloc(conf->slab_cache, GFP_KERNEL);
        if (!sh)
                return 0;
-       memset(sh, 0, sizeof(*sh) + (disks-1)*sizeof(struct r5dev));
+       memset(sh, 0, sizeof(*sh) + (conf->pool_size-1)*sizeof(struct r5dev));
        sh->raid_conf = conf;
        spin_lock_init(&sh->lock);
        #ifdef CONFIG_MULTICORE_RAID456
        init_waitqueue_head(&sh->ops.wait_for_ops);
        #endif
 
-       if (grow_buffers(sh, disks)) {
-               shrink_buffers(sh, disks);
+       if (grow_buffers(sh)) {
+               shrink_buffers(sh);
                kmem_cache_free(conf->slab_cache, sh);
                return 0;
        }
@@ -1468,7 +1536,7 @@ static int drop_one_stripe(raid5_conf_t *conf)
        if (!sh)
                return 0;
        BUG_ON(atomic_read(&sh->count));
-       shrink_buffers(sh, conf->pool_size);
+       shrink_buffers(sh);
        kmem_cache_free(conf->slab_cache, sh);
        atomic_dec(&conf->active_stripes);
        return 1;
@@ -2963,7 +3031,6 @@ static void handle_stripe5(struct stripe_head *sh)
                mdk_rdev_t *rdev;
 
                dev = &sh->dev[i];
-               clear_bit(R5_Insync, &dev->flags);
 
                pr_debug("check %d: state 0x%lx toread %p read %p write %p "
                        "written %p\n", i, dev->flags, dev->toread, dev->read,
@@ -3000,17 +3067,27 @@ static void handle_stripe5(struct stripe_head *sh)
                        blocked_rdev = rdev;
                        atomic_inc(&rdev->nr_pending);
                }
-               if (!rdev || !test_bit(In_sync, &rdev->flags)) {
+               clear_bit(R5_Insync, &dev->flags);
+               if (!rdev)
+                       /* Not in-sync */;
+               else if (test_bit(In_sync, &rdev->flags))
+                       set_bit(R5_Insync, &dev->flags);
+               else {
+                       /* could be in-sync depending on recovery/reshape status */
+                       if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset)
+                               set_bit(R5_Insync, &dev->flags);
+               }
+               if (!test_bit(R5_Insync, &dev->flags)) {
                        /* The ReadError flag will just be confusing now */
                        clear_bit(R5_ReadError, &dev->flags);
                        clear_bit(R5_ReWrite, &dev->flags);
                }
-               if (!rdev || !test_bit(In_sync, &rdev->flags)
-                   || test_bit(R5_ReadError, &dev->flags)) {
+               if (test_bit(R5_ReadError, &dev->flags))
+                       clear_bit(R5_Insync, &dev->flags);
+               if (!test_bit(R5_Insync, &dev->flags)) {
                        s.failed++;
                        s.failed_num = i;
-               } else
-                       set_bit(R5_Insync, &dev->flags);
+               }
        }
        rcu_read_unlock();
 
@@ -3244,7 +3321,6 @@ static void handle_stripe6(struct stripe_head *sh)
        for (i=disks; i--; ) {
                mdk_rdev_t *rdev;
                dev = &sh->dev[i];
-               clear_bit(R5_Insync, &dev->flags);
 
                pr_debug("check %d: state 0x%lx read %p write %p written %p\n",
                        i, dev->flags, dev->toread, dev->towrite, dev->written);
@@ -3282,18 +3358,28 @@ static void handle_stripe6(struct stripe_head *sh)
                        blocked_rdev = rdev;
                        atomic_inc(&rdev->nr_pending);
                }
-               if (!rdev || !test_bit(In_sync, &rdev->flags)) {
+               clear_bit(R5_Insync, &dev->flags);
+               if (!rdev)
+                       /* Not in-sync */;
+               else if (test_bit(In_sync, &rdev->flags))
+                       set_bit(R5_Insync, &dev->flags);
+               else {
+                       /* in sync if before recovery_offset */
+                       if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset)
+                               set_bit(R5_Insync, &dev->flags);
+               }
+               if (!test_bit(R5_Insync, &dev->flags)) {
                        /* The ReadError flag will just be confusing now */
                        clear_bit(R5_ReadError, &dev->flags);
                        clear_bit(R5_ReWrite, &dev->flags);
                }
-               if (!rdev || !test_bit(In_sync, &rdev->flags)
-                   || test_bit(R5_ReadError, &dev->flags)) {
+               if (test_bit(R5_ReadError, &dev->flags))
+                       clear_bit(R5_Insync, &dev->flags);
+               if (!test_bit(R5_Insync, &dev->flags)) {
                        if (s.failed < 2)
                                r6s.failed_num[s.failed] = i;
                        s.failed++;
-               } else
-                       set_bit(R5_Insync, &dev->flags);
+               }
        }
        rcu_read_unlock();
 
@@ -3872,7 +3958,7 @@ static int make_request(mddev_t *mddev, struct bio * bi)
        const int rw = bio_data_dir(bi);
        int remaining;
 
-       if (unlikely(bio_rw_flagged(bi, BIO_RW_BARRIER))) {
+       if (unlikely(bi->bi_rw & REQ_HARDBARRIER)) {
                /* Drain all pending writes.  We only really need
                 * to ensure they have been submitted, but this is
                 * easier.
@@ -4971,8 +5057,10 @@ static int run(mddev_t *mddev)
        list_for_each_entry(rdev, &mddev->disks, same_set) {
                if (rdev->raid_disk < 0)
                        continue;
-               if (test_bit(In_sync, &rdev->flags))
+               if (test_bit(In_sync, &rdev->flags)) {
                        working_disks++;
+                       continue;
+               }
                /* This disc is not fully in-sync.  However if it
                 * just stored parity (beyond the recovery_offset),
                 * when we don't need to be concerned about the
@@ -5005,7 +5093,7 @@ static int run(mddev_t *mddev)
        mddev->degraded = (max(conf->raid_disks, conf->previous_raid_disks)
                           - working_disks);
 
-       if (mddev->degraded > conf->max_degraded) {
+       if (has_failed(conf)) {
                printk(KERN_ERR "md/raid:%s: not enough operational devices"
                        " (%d/%d failed)\n",
                        mdname(mddev), mddev->degraded, conf->raid_disks);
@@ -5207,6 +5295,7 @@ static int raid5_spare_active(mddev_t *mddev)
        for (i = 0; i < conf->raid_disks; i++) {
                tmp = conf->disks + i;
                if (tmp->rdev
+                   && tmp->rdev->recovery_offset == MaxSector
                    && !test_bit(Faulty, &tmp->rdev->flags)
                    && !test_and_set_bit(In_sync, &tmp->rdev->flags)) {
                        unsigned long flags;
@@ -5242,7 +5331,7 @@ static int raid5_remove_disk(mddev_t *mddev, int number)
                 * isn't possible.
                 */
                if (!test_bit(Faulty, &rdev->flags) &&
-                   mddev->degraded <= conf->max_degraded &&
+                   !has_failed(conf) &&
                    number < conf->raid_disks) {
                        err = -EBUSY;
                        goto abort;
@@ -5270,7 +5359,7 @@ static int raid5_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
        int first = 0;
        int last = conf->raid_disks - 1;
 
-       if (mddev->degraded > conf->max_degraded)
+       if (has_failed(conf))
                /* no point adding a device */
                return -EINVAL;
 
@@ -5362,7 +5451,7 @@ static int check_reshape(mddev_t *mddev)
        if (mddev->bitmap)
                /* Cannot grow a bitmap yet */
                return -EBUSY;
-       if (mddev->degraded > conf->max_degraded)
+       if (has_failed(conf))
                return -EINVAL;
        if (mddev->delta_disks < 0) {
                /* We might be able to shrink, but the devices must
@@ -5437,8 +5526,13 @@ static int raid5_start_reshape(mddev_t *mddev)
 
        /* Add some new drives, as many as will fit.
         * We know there are enough to make the newly sized array work.
+        * Don't add devices if we are reducing the number of
+        * devices in the array.  This is because it is not possible
+        * to correctly record the "partially reconstructed" state of
+        * such devices during the reshape and confusion could result.
         */
-       list_for_each_entry(rdev, &mddev->disks, same_set)
+       if (mddev->delta_disks >= 0)
+           list_for_each_entry(rdev, &mddev->disks, same_set)
                if (rdev->raid_disk < 0 &&
                    !test_bit(Faulty, &rdev->flags)) {
                        if (raid5_add_disk(mddev, rdev) == 0) {
@@ -5460,7 +5554,7 @@ static int raid5_start_reshape(mddev_t *mddev)
                }
 
        /* When a reshape changes the number of devices, ->degraded
-        * is measured against the large of the pre and post number of
+        * is measured against the larger of the pre and post number of
         * devices.*/
        if (mddev->delta_disks > 0) {
                spin_lock_irqsave(&conf->device_lock, flags);