net-next-2.6.git / drivers / md / md.c (blame)
[PATCH] md: improve the interface to sync_request
1/*
2 md.c : Multiple Devices driver for Linux
3 Copyright (C) 1998, 1999, 2000 Ingo Molnar
4
5 completely rewritten, based on the MD driver code from Marc Zyngier
6
7 Changes:
8
9 - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar
10 - RAID-6 extensions by H. Peter Anvin <hpa@zytor.com>
11 - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net>
12 - kerneld support by Boris Tobotras <boris@xtalk.msk.su>
13 - kmod support by: Cyrus Durgin
14 - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com>
15 - Devfs support by Richard Gooch <rgooch@atnf.csiro.au>
16
17 - lots of fixes and improvements to the RAID1/RAID5 and generic
18 RAID code (such as request based resynchronization):
19
20 Neil Brown <neilb@cse.unsw.edu.au>.
21
22 This program is free software; you can redistribute it and/or modify
23 it under the terms of the GNU General Public License as published by
24 the Free Software Foundation; either version 2, or (at your option)
25 any later version.
26
27 You should have received a copy of the GNU General Public License
28 (for example /usr/src/linux/COPYING); if not, write to the Free
29 Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
30*/
31
32#include <linux/module.h>
33#include <linux/config.h>
34#include <linux/linkage.h>
35#include <linux/raid/md.h>
36#include <linux/sysctl.h>
37#include <linux/devfs_fs_kernel.h>
38#include <linux/buffer_head.h> /* for invalidate_bdev */
39#include <linux/suspend.h>
40
41#include <linux/init.h>
42
43#ifdef CONFIG_KMOD
44#include <linux/kmod.h>
45#endif
46
47#include <asm/unaligned.h>
48
49#define MAJOR_NR MD_MAJOR
50#define MD_DRIVER
51
52/* 63 partitions with the alternate major number (mdp) */
53#define MdpMinorShift 6
54
55#define DEBUG 0
56#define dprintk(x...) ((void)(DEBUG && printk(x)))
57
58
59#ifndef MODULE
60static void autostart_arrays (int part);
61#endif
62
63static mdk_personality_t *pers[MAX_PERSONALITY];
64static DEFINE_SPINLOCK(pers_lock);
65
66/*
67 * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
68 * is 1000 KB/sec, so the extra system load does not show up that much.
69 * Increase it if you want to have more _guaranteed_ speed. Note that
 70 * the RAID driver will use the maximum available bandwidth if the IO
71 * subsystem is idle. There is also an 'absolute maximum' reconstruction
72 * speed limit - in case reconstruction slows down your system despite
73 * idle IO detection.
74 *
75 * you can change it via /proc/sys/dev/raid/speed_limit_min and _max.
76 */
77
78static int sysctl_speed_limit_min = 1000;
79static int sysctl_speed_limit_max = 200000;
80
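/*
 * Usage sketch (illustrative only): a minimal userspace snippet that raises
 * the guaranteed resync floor through the sysctl file mentioned in the
 * comment above.  The helper name and the 5000 KB/sec value are hypothetical.
 */
#include <stdio.h>

int md_set_min_resync_speed_example(void)
{
	FILE *f = fopen("/proc/sys/dev/raid/speed_limit_min", "w");

	if (!f)
		return -1;
	fprintf(f, "%d\n", 5000);	/* KB/sec; the default above is 1000 */
	return fclose(f);
}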
81static struct ctl_table_header *raid_table_header;
82
83static ctl_table raid_table[] = {
84 {
85 .ctl_name = DEV_RAID_SPEED_LIMIT_MIN,
86 .procname = "speed_limit_min",
87 .data = &sysctl_speed_limit_min,
88 .maxlen = sizeof(int),
89 .mode = 0644,
90 .proc_handler = &proc_dointvec,
91 },
92 {
93 .ctl_name = DEV_RAID_SPEED_LIMIT_MAX,
94 .procname = "speed_limit_max",
95 .data = &sysctl_speed_limit_max,
96 .maxlen = sizeof(int),
97 .mode = 0644,
98 .proc_handler = &proc_dointvec,
99 },
100 { .ctl_name = 0 }
101};
102
103static ctl_table raid_dir_table[] = {
104 {
105 .ctl_name = DEV_RAID,
106 .procname = "raid",
107 .maxlen = 0,
108 .mode = 0555,
109 .child = raid_table,
110 },
111 { .ctl_name = 0 }
112};
113
114static ctl_table raid_root_table[] = {
115 {
116 .ctl_name = CTL_DEV,
117 .procname = "dev",
118 .maxlen = 0,
119 .mode = 0555,
120 .child = raid_dir_table,
121 },
122 { .ctl_name = 0 }
123};
124
125static struct block_device_operations md_fops;
126
127/*
 128 * Enables iteration over all existing md arrays;
129 * all_mddevs_lock protects this list.
130 */
131static LIST_HEAD(all_mddevs);
132static DEFINE_SPINLOCK(all_mddevs_lock);
133
134
135/*
136 * iterates through all used mddevs in the system.
137 * We take care to grab the all_mddevs_lock whenever navigating
138 * the list, and to always hold a refcount when unlocked.
 139 * Any code which breaks out of this loop while still holding
 140 * a reference to the current mddev must mddev_put() it.
141 */
142#define ITERATE_MDDEV(mddev,tmp) \
143 \
144 for (({ spin_lock(&all_mddevs_lock); \
145 tmp = all_mddevs.next; \
146 mddev = NULL;}); \
147 ({ if (tmp != &all_mddevs) \
148 mddev_get(list_entry(tmp, mddev_t, all_mddevs));\
149 spin_unlock(&all_mddevs_lock); \
150 if (mddev) mddev_put(mddev); \
151 mddev = list_entry(tmp, mddev_t, all_mddevs); \
152 tmp != &all_mddevs;}); \
153 ({ spin_lock(&all_mddevs_lock); \
154 tmp = tmp->next;}) \
155 )
156
157
158static int md_fail_request (request_queue_t *q, struct bio *bio)
159{
160 bio_io_error(bio, bio->bi_size);
161 return 0;
162}
163
164static inline mddev_t *mddev_get(mddev_t *mddev)
165{
166 atomic_inc(&mddev->active);
167 return mddev;
168}
169
170static void mddev_put(mddev_t *mddev)
171{
172 if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock))
173 return;
174 if (!mddev->raid_disks && list_empty(&mddev->disks)) {
175 list_del(&mddev->all_mddevs);
176 blk_put_queue(mddev->queue);
177 kfree(mddev);
178 }
179 spin_unlock(&all_mddevs_lock);
180}
181
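/*
 * Usage sketch (illustrative only) for the ITERATE_MDDEV macro above: a
 * hypothetical helper that counts registered arrays.  A completed pass
 * drops its references itself; only code that breaks out of the loop early
 * still owns the current mddev and must mddev_put() it.
 */
static void count_mddevs_example(void)
{
	mddev_t *mddev;
	struct list_head *tmp;
	int n = 0;

	ITERATE_MDDEV(mddev, tmp)
		n++;
	printk(KERN_INFO "md: example: %d array(s)\n", n);
}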
182static mddev_t * mddev_find(dev_t unit)
183{
184 mddev_t *mddev, *new = NULL;
185
186 retry:
187 spin_lock(&all_mddevs_lock);
188 list_for_each_entry(mddev, &all_mddevs, all_mddevs)
189 if (mddev->unit == unit) {
190 mddev_get(mddev);
191 spin_unlock(&all_mddevs_lock);
192 if (new)
193 kfree(new);
194 return mddev;
195 }
196
197 if (new) {
198 list_add(&new->all_mddevs, &all_mddevs);
199 spin_unlock(&all_mddevs_lock);
200 return new;
201 }
202 spin_unlock(&all_mddevs_lock);
203
204 new = (mddev_t *) kmalloc(sizeof(*new), GFP_KERNEL);
205 if (!new)
206 return NULL;
207
208 memset(new, 0, sizeof(*new));
209
210 new->unit = unit;
211 if (MAJOR(unit) == MD_MAJOR)
212 new->md_minor = MINOR(unit);
213 else
214 new->md_minor = MINOR(unit) >> MdpMinorShift;
215
216 init_MUTEX(&new->reconfig_sem);
217 INIT_LIST_HEAD(&new->disks);
218 INIT_LIST_HEAD(&new->all_mddevs);
219 init_timer(&new->safemode_timer);
220 atomic_set(&new->active, 1);
221 bio_list_init(&new->write_list);
222 spin_lock_init(&new->write_lock);
223
224 new->queue = blk_alloc_queue(GFP_KERNEL);
225 if (!new->queue) {
226 kfree(new);
227 return NULL;
228 }
229
230 blk_queue_make_request(new->queue, md_fail_request);
231
232 goto retry;
233}
234
235static inline int mddev_lock(mddev_t * mddev)
236{
237 return down_interruptible(&mddev->reconfig_sem);
238}
239
240static inline void mddev_lock_uninterruptible(mddev_t * mddev)
241{
242 down(&mddev->reconfig_sem);
243}
244
245static inline int mddev_trylock(mddev_t * mddev)
246{
247 return down_trylock(&mddev->reconfig_sem);
248}
249
250static inline void mddev_unlock(mddev_t * mddev)
251{
252 up(&mddev->reconfig_sem);
253
254 if (mddev->thread)
255 md_wakeup_thread(mddev->thread);
256}
257
258mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr)
259{
260 mdk_rdev_t * rdev;
261 struct list_head *tmp;
262
263 ITERATE_RDEV(mddev,rdev,tmp) {
264 if (rdev->desc_nr == nr)
265 return rdev;
266 }
267 return NULL;
268}
269
270static mdk_rdev_t * find_rdev(mddev_t * mddev, dev_t dev)
271{
272 struct list_head *tmp;
273 mdk_rdev_t *rdev;
274
275 ITERATE_RDEV(mddev,rdev,tmp) {
276 if (rdev->bdev->bd_dev == dev)
277 return rdev;
278 }
279 return NULL;
280}
281
282inline static sector_t calc_dev_sboffset(struct block_device *bdev)
283{
284 sector_t size = bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
285 return MD_NEW_SIZE_BLOCKS(size);
286}
287
288static sector_t calc_dev_size(mdk_rdev_t *rdev, unsigned chunk_size)
289{
290 sector_t size;
291
292 size = rdev->sb_offset;
293
294 if (chunk_size)
295 size &= ~((sector_t)chunk_size/1024 - 1);
296 return size;
297}
298
299static int alloc_disk_sb(mdk_rdev_t * rdev)
300{
301 if (rdev->sb_page)
302 MD_BUG();
303
304 rdev->sb_page = alloc_page(GFP_KERNEL);
305 if (!rdev->sb_page) {
306 printk(KERN_ALERT "md: out of memory.\n");
307 return -EINVAL;
308 }
309
310 return 0;
311}
312
313static void free_disk_sb(mdk_rdev_t * rdev)
314{
315 if (rdev->sb_page) {
316 page_cache_release(rdev->sb_page);
317 rdev->sb_loaded = 0;
318 rdev->sb_page = NULL;
319 rdev->sb_offset = 0;
320 rdev->size = 0;
321 }
322}
323
324
325static int bi_complete(struct bio *bio, unsigned int bytes_done, int error)
326{
327 if (bio->bi_size)
328 return 1;
329
330 complete((struct completion*)bio->bi_private);
331 return 0;
332}
333
334static int sync_page_io(struct block_device *bdev, sector_t sector, int size,
335 struct page *page, int rw)
336{
 337 struct bio *bio = bio_alloc(GFP_NOIO, 1);
338 struct completion event;
339 int ret;
340
341 rw |= (1 << BIO_RW_SYNC);
342
343 bio->bi_bdev = bdev;
344 bio->bi_sector = sector;
345 bio_add_page(bio, page, size, 0);
346 init_completion(&event);
347 bio->bi_private = &event;
348 bio->bi_end_io = bi_complete;
349 submit_bio(rw, bio);
350 wait_for_completion(&event);
351
352 ret = test_bit(BIO_UPTODATE, &bio->bi_flags);
353 bio_put(bio);
354 return ret;
355}
356
357static int read_disk_sb(mdk_rdev_t * rdev)
358{
359 char b[BDEVNAME_SIZE];
360 if (!rdev->sb_page) {
361 MD_BUG();
362 return -EINVAL;
363 }
364 if (rdev->sb_loaded)
365 return 0;
366
367
368 if (!sync_page_io(rdev->bdev, rdev->sb_offset<<1, MD_SB_BYTES, rdev->sb_page, READ))
369 goto fail;
370 rdev->sb_loaded = 1;
371 return 0;
372
373fail:
374 printk(KERN_WARNING "md: disabled device %s, could not read superblock.\n",
375 bdevname(rdev->bdev,b));
376 return -EINVAL;
377}
378
379static int uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2)
380{
381 if ( (sb1->set_uuid0 == sb2->set_uuid0) &&
382 (sb1->set_uuid1 == sb2->set_uuid1) &&
383 (sb1->set_uuid2 == sb2->set_uuid2) &&
384 (sb1->set_uuid3 == sb2->set_uuid3))
385
386 return 1;
387
388 return 0;
389}
390
391
392static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2)
393{
394 int ret;
395 mdp_super_t *tmp1, *tmp2;
396
397 tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL);
398 tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL);
399
400 if (!tmp1 || !tmp2) {
401 ret = 0;
402 printk(KERN_INFO "md.c: sb1 is not equal to sb2!\n");
403 goto abort;
404 }
405
406 *tmp1 = *sb1;
407 *tmp2 = *sb2;
408
409 /*
410 * nr_disks is not constant
411 */
412 tmp1->nr_disks = 0;
413 tmp2->nr_disks = 0;
414
415 if (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4))
416 ret = 0;
417 else
418 ret = 1;
419
420abort:
421 if (tmp1)
422 kfree(tmp1);
423 if (tmp2)
424 kfree(tmp2);
425
426 return ret;
427}
428
429static unsigned int calc_sb_csum(mdp_super_t * sb)
430{
431 unsigned int disk_csum, csum;
432
433 disk_csum = sb->sb_csum;
434 sb->sb_csum = 0;
435 csum = csum_partial((void *)sb, MD_SB_BYTES, 0);
436 sb->sb_csum = disk_csum;
437 return csum;
438}
439
440
441/*
442 * Handle superblock details.
443 * We want to be able to handle multiple superblock formats
444 * so we have a common interface to them all, and an array of
445 * different handlers.
446 * We rely on user-space to write the initial superblock, and support
447 * reading and updating of superblocks.
448 * Interface methods are:
449 * int load_super(mdk_rdev_t *dev, mdk_rdev_t *refdev, int minor_version)
450 * loads and validates a superblock on dev.
451 * if refdev != NULL, compare superblocks on both devices
452 * Return:
453 * 0 - dev has a superblock that is compatible with refdev
454 * 1 - dev has a superblock that is compatible and newer than refdev
455 * so dev should be used as the refdev in future
456 * -EINVAL superblock incompatible or invalid
457 * -othererror e.g. -EIO
458 *
459 * int validate_super(mddev_t *mddev, mdk_rdev_t *dev)
460 * Verify that dev is acceptable into mddev.
461 * The first time, mddev->raid_disks will be 0, and data from
462 * dev should be merged in. Subsequent calls check that dev
463 * is new enough. Return 0 or -EINVAL
464 *
465 * void sync_super(mddev_t *mddev, mdk_rdev_t *dev)
466 * Update the superblock for rdev with data in mddev
467 * This does not write to disc.
468 *
469 */
470
471struct super_type {
472 char *name;
473 struct module *owner;
474 int (*load_super)(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version);
475 int (*validate_super)(mddev_t *mddev, mdk_rdev_t *rdev);
476 void (*sync_super)(mddev_t *mddev, mdk_rdev_t *rdev);
477};
478
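/*
 * Calling-order sketch (illustrative only): a hypothetical helper showing
 * how a super_type entry is meant to be driven - load_super first, then
 * validate_super; sync_super is called later, just before writing back.
 * The real call sites are analyze_sbs() and sync_sbs() further down.
 */
static int probe_one_rdev_example(struct super_type *st, mddev_t *mddev,
				  mdk_rdev_t *rdev, int minor_version)
{
	int err = st->load_super(rdev, NULL, minor_version);

	if (err < 0)		/* -EINVAL, -EIO, ... */
		return err;
	return st->validate_super(mddev, rdev);
}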
479/*
480 * load_super for 0.90.0
481 */
482static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
483{
484 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
485 mdp_super_t *sb;
486 int ret;
487 sector_t sb_offset;
488
489 /*
490 * Calculate the position of the superblock,
491 * it's at the end of the disk.
492 *
493 * It also happens to be a multiple of 4Kb.
494 */
495 sb_offset = calc_dev_sboffset(rdev->bdev);
496 rdev->sb_offset = sb_offset;
497
498 ret = read_disk_sb(rdev);
499 if (ret) return ret;
500
501 ret = -EINVAL;
502
503 bdevname(rdev->bdev, b);
504 sb = (mdp_super_t*)page_address(rdev->sb_page);
505
506 if (sb->md_magic != MD_SB_MAGIC) {
507 printk(KERN_ERR "md: invalid raid superblock magic on %s\n",
508 b);
509 goto abort;
510 }
511
512 if (sb->major_version != 0 ||
513 sb->minor_version != 90) {
514 printk(KERN_WARNING "Bad version number %d.%d on %s\n",
515 sb->major_version, sb->minor_version,
516 b);
517 goto abort;
518 }
519
520 if (sb->raid_disks <= 0)
521 goto abort;
522
523 if (csum_fold(calc_sb_csum(sb)) != csum_fold(sb->sb_csum)) {
524 printk(KERN_WARNING "md: invalid superblock checksum on %s\n",
525 b);
526 goto abort;
527 }
528
529 rdev->preferred_minor = sb->md_minor;
530 rdev->data_offset = 0;
531
532 if (sb->level == LEVEL_MULTIPATH)
533 rdev->desc_nr = -1;
534 else
535 rdev->desc_nr = sb->this_disk.number;
536
537 if (refdev == 0)
538 ret = 1;
539 else {
540 __u64 ev1, ev2;
541 mdp_super_t *refsb = (mdp_super_t*)page_address(refdev->sb_page);
542 if (!uuid_equal(refsb, sb)) {
543 printk(KERN_WARNING "md: %s has different UUID to %s\n",
544 b, bdevname(refdev->bdev,b2));
545 goto abort;
546 }
547 if (!sb_equal(refsb, sb)) {
548 printk(KERN_WARNING "md: %s has same UUID"
549 " but different superblock to %s\n",
550 b, bdevname(refdev->bdev, b2));
551 goto abort;
552 }
553 ev1 = md_event(sb);
554 ev2 = md_event(refsb);
555 if (ev1 > ev2)
556 ret = 1;
557 else
558 ret = 0;
559 }
560 rdev->size = calc_dev_size(rdev, sb->chunk_size);
561
562 abort:
563 return ret;
564}
565
566/*
567 * validate_super for 0.90.0
568 */
569static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
570{
571 mdp_disk_t *desc;
572 mdp_super_t *sb = (mdp_super_t *)page_address(rdev->sb_page);
573
574 if (mddev->raid_disks == 0) {
575 mddev->major_version = 0;
576 mddev->minor_version = sb->minor_version;
577 mddev->patch_version = sb->patch_version;
578 mddev->persistent = ! sb->not_persistent;
579 mddev->chunk_size = sb->chunk_size;
580 mddev->ctime = sb->ctime;
581 mddev->utime = sb->utime;
582 mddev->level = sb->level;
583 mddev->layout = sb->layout;
584 mddev->raid_disks = sb->raid_disks;
585 mddev->size = sb->size;
586 mddev->events = md_event(sb);
587
588 if (sb->state & (1<<MD_SB_CLEAN))
589 mddev->recovery_cp = MaxSector;
590 else {
591 if (sb->events_hi == sb->cp_events_hi &&
592 sb->events_lo == sb->cp_events_lo) {
593 mddev->recovery_cp = sb->recovery_cp;
594 } else
595 mddev->recovery_cp = 0;
596 }
597
598 memcpy(mddev->uuid+0, &sb->set_uuid0, 4);
599 memcpy(mddev->uuid+4, &sb->set_uuid1, 4);
600 memcpy(mddev->uuid+8, &sb->set_uuid2, 4);
601 memcpy(mddev->uuid+12,&sb->set_uuid3, 4);
602
603 mddev->max_disks = MD_SB_DISKS;
604 } else {
605 __u64 ev1;
606 ev1 = md_event(sb);
607 ++ev1;
608 if (ev1 < mddev->events)
609 return -EINVAL;
610 }
611 if (mddev->level != LEVEL_MULTIPATH) {
612 rdev->raid_disk = -1;
613 rdev->in_sync = rdev->faulty = 0;
614 desc = sb->disks + rdev->desc_nr;
615
616 if (desc->state & (1<<MD_DISK_FAULTY))
617 rdev->faulty = 1;
618 else if (desc->state & (1<<MD_DISK_SYNC) &&
619 desc->raid_disk < mddev->raid_disks) {
620 rdev->in_sync = 1;
621 rdev->raid_disk = desc->raid_disk;
622 }
623 }
624 return 0;
625}
626
627/*
628 * sync_super for 0.90.0
629 */
630static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
631{
632 mdp_super_t *sb;
633 struct list_head *tmp;
634 mdk_rdev_t *rdev2;
635 int next_spare = mddev->raid_disks;
636
637 /* make rdev->sb match mddev data..
638 *
639 * 1/ zero out disks
640 * 2/ Add info for each disk, keeping track of highest desc_nr (next_spare);
641 * 3/ any empty disks < next_spare become removed
642 *
643 * disks[0] gets initialised to REMOVED because
644 * we cannot be sure from other fields if it has
645 * been initialised or not.
646 */
647 int i;
648 int active=0, working=0,failed=0,spare=0,nr_disks=0;
649
650 sb = (mdp_super_t*)page_address(rdev->sb_page);
651
652 memset(sb, 0, sizeof(*sb));
653
654 sb->md_magic = MD_SB_MAGIC;
655 sb->major_version = mddev->major_version;
656 sb->minor_version = mddev->minor_version;
657 sb->patch_version = mddev->patch_version;
658 sb->gvalid_words = 0; /* ignored */
659 memcpy(&sb->set_uuid0, mddev->uuid+0, 4);
660 memcpy(&sb->set_uuid1, mddev->uuid+4, 4);
661 memcpy(&sb->set_uuid2, mddev->uuid+8, 4);
662 memcpy(&sb->set_uuid3, mddev->uuid+12,4);
663
664 sb->ctime = mddev->ctime;
665 sb->level = mddev->level;
666 sb->size = mddev->size;
667 sb->raid_disks = mddev->raid_disks;
668 sb->md_minor = mddev->md_minor;
669 sb->not_persistent = !mddev->persistent;
670 sb->utime = mddev->utime;
671 sb->state = 0;
672 sb->events_hi = (mddev->events>>32);
673 sb->events_lo = (u32)mddev->events;
674
675 if (mddev->in_sync)
676 {
677 sb->recovery_cp = mddev->recovery_cp;
678 sb->cp_events_hi = (mddev->events>>32);
679 sb->cp_events_lo = (u32)mddev->events;
680 if (mddev->recovery_cp == MaxSector)
681 sb->state = (1<< MD_SB_CLEAN);
682 } else
683 sb->recovery_cp = 0;
684
685 sb->layout = mddev->layout;
686 sb->chunk_size = mddev->chunk_size;
687
688 sb->disks[0].state = (1<<MD_DISK_REMOVED);
689 ITERATE_RDEV(mddev,rdev2,tmp) {
690 mdp_disk_t *d;
691 if (rdev2->raid_disk >= 0 && rdev2->in_sync && !rdev2->faulty)
692 rdev2->desc_nr = rdev2->raid_disk;
693 else
694 rdev2->desc_nr = next_spare++;
695 d = &sb->disks[rdev2->desc_nr];
696 nr_disks++;
697 d->number = rdev2->desc_nr;
698 d->major = MAJOR(rdev2->bdev->bd_dev);
699 d->minor = MINOR(rdev2->bdev->bd_dev);
 700 if (rdev2->raid_disk >= 0 && rdev2->in_sync && !rdev2->faulty)
701 d->raid_disk = rdev2->raid_disk;
702 else
703 d->raid_disk = rdev2->desc_nr; /* compatibility */
704 if (rdev2->faulty) {
705 d->state = (1<<MD_DISK_FAULTY);
706 failed++;
707 } else if (rdev2->in_sync) {
708 d->state = (1<<MD_DISK_ACTIVE);
709 d->state |= (1<<MD_DISK_SYNC);
710 active++;
711 working++;
712 } else {
713 d->state = 0;
714 spare++;
715 working++;
716 }
717 }
718
719 /* now set the "removed" and "faulty" bits on any missing devices */
720 for (i=0 ; i < mddev->raid_disks ; i++) {
721 mdp_disk_t *d = &sb->disks[i];
722 if (d->state == 0 && d->number == 0) {
723 d->number = i;
724 d->raid_disk = i;
725 d->state = (1<<MD_DISK_REMOVED);
726 d->state |= (1<<MD_DISK_FAULTY);
727 failed++;
728 }
729 }
730 sb->nr_disks = nr_disks;
731 sb->active_disks = active;
732 sb->working_disks = working;
733 sb->failed_disks = failed;
734 sb->spare_disks = spare;
735
736 sb->this_disk = sb->disks[rdev->desc_nr];
737 sb->sb_csum = calc_sb_csum(sb);
738}
739
740/*
741 * version 1 superblock
742 */
743
744static unsigned int calc_sb_1_csum(struct mdp_superblock_1 * sb)
745{
746 unsigned int disk_csum, csum;
747 unsigned long long newcsum;
748 int size = 256 + le32_to_cpu(sb->max_dev)*2;
749 unsigned int *isuper = (unsigned int*)sb;
750 int i;
751
752 disk_csum = sb->sb_csum;
753 sb->sb_csum = 0;
754 newcsum = 0;
755 for (i=0; size>=4; size -= 4 )
756 newcsum += le32_to_cpu(*isuper++);
757
758 if (size == 2)
759 newcsum += le16_to_cpu(*(unsigned short*) isuper);
760
761 csum = (newcsum & 0xffffffff) + (newcsum >> 32);
762 sb->sb_csum = disk_csum;
763 return cpu_to_le32(csum);
764}
765
766static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
767{
768 struct mdp_superblock_1 *sb;
769 int ret;
770 sector_t sb_offset;
771 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
772
773 /*
774 * Calculate the position of the superblock.
775 * It is always aligned to a 4K boundary and
 776 * depending on minor_version, it can be:
777 * 0: At least 8K, but less than 12K, from end of device
778 * 1: At start of device
779 * 2: 4K from start of device.
780 */
781 switch(minor_version) {
782 case 0:
783 sb_offset = rdev->bdev->bd_inode->i_size >> 9;
784 sb_offset -= 8*2;
785 sb_offset &= ~(4*2-1);
786 /* convert from sectors to K */
787 sb_offset /= 2;
788 break;
789 case 1:
790 sb_offset = 0;
791 break;
792 case 2:
793 sb_offset = 4;
794 break;
795 default:
796 return -EINVAL;
797 }
798 rdev->sb_offset = sb_offset;
799
800 ret = read_disk_sb(rdev);
801 if (ret) return ret;
802
803
804 sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);
805
806 if (sb->magic != cpu_to_le32(MD_SB_MAGIC) ||
807 sb->major_version != cpu_to_le32(1) ||
808 le32_to_cpu(sb->max_dev) > (4096-256)/2 ||
809 le64_to_cpu(sb->super_offset) != (rdev->sb_offset<<1) ||
810 sb->feature_map != 0)
811 return -EINVAL;
812
813 if (calc_sb_1_csum(sb) != sb->sb_csum) {
814 printk("md: invalid superblock checksum on %s\n",
815 bdevname(rdev->bdev,b));
816 return -EINVAL;
817 }
818 if (le64_to_cpu(sb->data_size) < 10) {
819 printk("md: data_size too small on %s\n",
820 bdevname(rdev->bdev,b));
821 return -EINVAL;
822 }
823 rdev->preferred_minor = 0xffff;
824 rdev->data_offset = le64_to_cpu(sb->data_offset);
825
826 if (refdev == 0)
827 return 1;
828 else {
829 __u64 ev1, ev2;
830 struct mdp_superblock_1 *refsb =
831 (struct mdp_superblock_1*)page_address(refdev->sb_page);
832
833 if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 ||
834 sb->level != refsb->level ||
835 sb->layout != refsb->layout ||
836 sb->chunksize != refsb->chunksize) {
837 printk(KERN_WARNING "md: %s has strangely different"
838 " superblock to %s\n",
839 bdevname(rdev->bdev,b),
840 bdevname(refdev->bdev,b2));
841 return -EINVAL;
842 }
843 ev1 = le64_to_cpu(sb->events);
844 ev2 = le64_to_cpu(refsb->events);
845
846 if (ev1 > ev2)
847 return 1;
848 }
849 if (minor_version)
850 rdev->size = ((rdev->bdev->bd_inode->i_size>>9) - le64_to_cpu(sb->data_offset)) / 2;
851 else
852 rdev->size = rdev->sb_offset;
853 if (rdev->size < le64_to_cpu(sb->data_size)/2)
854 return -EINVAL;
855 rdev->size = le64_to_cpu(sb->data_size)/2;
856 if (le32_to_cpu(sb->chunksize))
857 rdev->size &= ~((sector_t)le32_to_cpu(sb->chunksize)/2 - 1);
858 return 0;
859}
860
861static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
862{
863 struct mdp_superblock_1 *sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);
864
865 if (mddev->raid_disks == 0) {
866 mddev->major_version = 1;
867 mddev->patch_version = 0;
868 mddev->persistent = 1;
869 mddev->chunk_size = le32_to_cpu(sb->chunksize) << 9;
870 mddev->ctime = le64_to_cpu(sb->ctime) & ((1ULL << 32)-1);
871 mddev->utime = le64_to_cpu(sb->utime) & ((1ULL << 32)-1);
872 mddev->level = le32_to_cpu(sb->level);
873 mddev->layout = le32_to_cpu(sb->layout);
874 mddev->raid_disks = le32_to_cpu(sb->raid_disks);
875 mddev->size = le64_to_cpu(sb->size)/2;
876 mddev->events = le64_to_cpu(sb->events);
877
878 mddev->recovery_cp = le64_to_cpu(sb->resync_offset);
879 memcpy(mddev->uuid, sb->set_uuid, 16);
880
881 mddev->max_disks = (4096-256)/2;
882 } else {
883 __u64 ev1;
884 ev1 = le64_to_cpu(sb->events);
885 ++ev1;
886 if (ev1 < mddev->events)
887 return -EINVAL;
888 }
889
890 if (mddev->level != LEVEL_MULTIPATH) {
891 int role;
892 rdev->desc_nr = le32_to_cpu(sb->dev_number);
893 role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
894 switch(role) {
895 case 0xffff: /* spare */
896 rdev->in_sync = 0;
897 rdev->faulty = 0;
898 rdev->raid_disk = -1;
899 break;
900 case 0xfffe: /* faulty */
901 rdev->in_sync = 0;
902 rdev->faulty = 1;
903 rdev->raid_disk = -1;
904 break;
905 default:
906 rdev->in_sync = 1;
907 rdev->faulty = 0;
908 rdev->raid_disk = role;
909 break;
910 }
911 }
912 return 0;
913}
914
915static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
916{
917 struct mdp_superblock_1 *sb;
918 struct list_head *tmp;
919 mdk_rdev_t *rdev2;
920 int max_dev, i;
921 /* make rdev->sb match mddev and rdev data. */
922
923 sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);
924
925 sb->feature_map = 0;
926 sb->pad0 = 0;
927 memset(sb->pad1, 0, sizeof(sb->pad1));
928 memset(sb->pad2, 0, sizeof(sb->pad2));
929 memset(sb->pad3, 0, sizeof(sb->pad3));
930
931 sb->utime = cpu_to_le64((__u64)mddev->utime);
932 sb->events = cpu_to_le64(mddev->events);
933 if (mddev->in_sync)
934 sb->resync_offset = cpu_to_le64(mddev->recovery_cp);
935 else
936 sb->resync_offset = cpu_to_le64(0);
937
938 max_dev = 0;
939 ITERATE_RDEV(mddev,rdev2,tmp)
940 if (rdev2->desc_nr+1 > max_dev)
941 max_dev = rdev2->desc_nr+1;
942
943 sb->max_dev = cpu_to_le32(max_dev);
944 for (i=0; i<max_dev;i++)
945 sb->dev_roles[i] = cpu_to_le16(0xfffe);
946
947 ITERATE_RDEV(mddev,rdev2,tmp) {
948 i = rdev2->desc_nr;
949 if (rdev2->faulty)
950 sb->dev_roles[i] = cpu_to_le16(0xfffe);
951 else if (rdev2->in_sync)
952 sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
953 else
954 sb->dev_roles[i] = cpu_to_le16(0xffff);
955 }
956
957 sb->recovery_offset = cpu_to_le64(0); /* not supported yet */
958 sb->sb_csum = calc_sb_1_csum(sb);
959}
960
961
 962static struct super_type super_types[] = {
963 [0] = {
964 .name = "0.90.0",
965 .owner = THIS_MODULE,
966 .load_super = super_90_load,
967 .validate_super = super_90_validate,
968 .sync_super = super_90_sync,
969 },
970 [1] = {
971 .name = "md-1",
972 .owner = THIS_MODULE,
973 .load_super = super_1_load,
974 .validate_super = super_1_validate,
975 .sync_super = super_1_sync,
976 },
977};
978
979static mdk_rdev_t * match_dev_unit(mddev_t *mddev, mdk_rdev_t *dev)
980{
981 struct list_head *tmp;
982 mdk_rdev_t *rdev;
983
984 ITERATE_RDEV(mddev,rdev,tmp)
985 if (rdev->bdev->bd_contains == dev->bdev->bd_contains)
986 return rdev;
987
988 return NULL;
989}
990
991static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2)
992{
993 struct list_head *tmp;
994 mdk_rdev_t *rdev;
995
996 ITERATE_RDEV(mddev1,rdev,tmp)
997 if (match_dev_unit(mddev2, rdev))
998 return 1;
999
1000 return 0;
1001}
1002
1003static LIST_HEAD(pending_raid_disks);
1004
1005static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
1006{
1007 mdk_rdev_t *same_pdev;
1008 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
1009
1010 if (rdev->mddev) {
1011 MD_BUG();
1012 return -EINVAL;
1013 }
1014 same_pdev = match_dev_unit(mddev, rdev);
1015 if (same_pdev)
1016 printk(KERN_WARNING
1017 "%s: WARNING: %s appears to be on the same physical"
 1018 " disk as %s.\n True protection against single-disk"
1019 " failure might be compromised.\n",
1020 mdname(mddev), bdevname(rdev->bdev,b),
1021 bdevname(same_pdev->bdev,b2));
1022
1023 /* Verify rdev->desc_nr is unique.
1024 * If it is -1, assign a free number, else
1025 * check number is not in use
1026 */
1027 if (rdev->desc_nr < 0) {
1028 int choice = 0;
1029 if (mddev->pers) choice = mddev->raid_disks;
1030 while (find_rdev_nr(mddev, choice))
1031 choice++;
1032 rdev->desc_nr = choice;
1033 } else {
1034 if (find_rdev_nr(mddev, rdev->desc_nr))
1035 return -EBUSY;
1036 }
1037
1038 list_add(&rdev->same_set, &mddev->disks);
1039 rdev->mddev = mddev;
1040 printk(KERN_INFO "md: bind<%s>\n", bdevname(rdev->bdev,b));
1041 return 0;
1042}
1043
1044static void unbind_rdev_from_array(mdk_rdev_t * rdev)
1045{
1046 char b[BDEVNAME_SIZE];
1047 if (!rdev->mddev) {
1048 MD_BUG();
1049 return;
1050 }
1051 list_del_init(&rdev->same_set);
1052 printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b));
1053 rdev->mddev = NULL;
1054}
1055
1056/*
1057 * prevent the device from being mounted, repartitioned or
1058 * otherwise reused by a RAID array (or any other kernel
1059 * subsystem), by bd_claiming the device.
1060 */
1061static int lock_rdev(mdk_rdev_t *rdev, dev_t dev)
1062{
1063 int err = 0;
1064 struct block_device *bdev;
1065 char b[BDEVNAME_SIZE];
1066
1067 bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE);
1068 if (IS_ERR(bdev)) {
1069 printk(KERN_ERR "md: could not open %s.\n",
1070 __bdevname(dev, b));
1071 return PTR_ERR(bdev);
1072 }
1073 err = bd_claim(bdev, rdev);
1074 if (err) {
1075 printk(KERN_ERR "md: could not bd_claim %s.\n",
1076 bdevname(bdev, b));
1077 blkdev_put(bdev);
1078 return err;
1079 }
1080 rdev->bdev = bdev;
1081 return err;
1082}
1083
1084static void unlock_rdev(mdk_rdev_t *rdev)
1085{
1086 struct block_device *bdev = rdev->bdev;
1087 rdev->bdev = NULL;
1088 if (!bdev)
1089 MD_BUG();
1090 bd_release(bdev);
1091 blkdev_put(bdev);
1092}
1093
1094void md_autodetect_dev(dev_t dev);
1095
1096static void export_rdev(mdk_rdev_t * rdev)
1097{
1098 char b[BDEVNAME_SIZE];
1099 printk(KERN_INFO "md: export_rdev(%s)\n",
1100 bdevname(rdev->bdev,b));
1101 if (rdev->mddev)
1102 MD_BUG();
1103 free_disk_sb(rdev);
1104 list_del_init(&rdev->same_set);
1105#ifndef MODULE
1106 md_autodetect_dev(rdev->bdev->bd_dev);
1107#endif
1108 unlock_rdev(rdev);
1109 kfree(rdev);
1110}
1111
1112static void kick_rdev_from_array(mdk_rdev_t * rdev)
1113{
1114 unbind_rdev_from_array(rdev);
1115 export_rdev(rdev);
1116}
1117
1118static void export_array(mddev_t *mddev)
1119{
1120 struct list_head *tmp;
1121 mdk_rdev_t *rdev;
1122
1123 ITERATE_RDEV(mddev,rdev,tmp) {
1124 if (!rdev->mddev) {
1125 MD_BUG();
1126 continue;
1127 }
1128 kick_rdev_from_array(rdev);
1129 }
1130 if (!list_empty(&mddev->disks))
1131 MD_BUG();
1132 mddev->raid_disks = 0;
1133 mddev->major_version = 0;
1134}
1135
1136static void print_desc(mdp_disk_t *desc)
1137{
1138 printk(" DISK<N:%d,(%d,%d),R:%d,S:%d>\n", desc->number,
1139 desc->major,desc->minor,desc->raid_disk,desc->state);
1140}
1141
1142static void print_sb(mdp_super_t *sb)
1143{
1144 int i;
1145
1146 printk(KERN_INFO
1147 "md: SB: (V:%d.%d.%d) ID:<%08x.%08x.%08x.%08x> CT:%08x\n",
1148 sb->major_version, sb->minor_version, sb->patch_version,
1149 sb->set_uuid0, sb->set_uuid1, sb->set_uuid2, sb->set_uuid3,
1150 sb->ctime);
1151 printk(KERN_INFO "md: L%d S%08d ND:%d RD:%d md%d LO:%d CS:%d\n",
1152 sb->level, sb->size, sb->nr_disks, sb->raid_disks,
1153 sb->md_minor, sb->layout, sb->chunk_size);
1154 printk(KERN_INFO "md: UT:%08x ST:%d AD:%d WD:%d"
1155 " FD:%d SD:%d CSUM:%08x E:%08lx\n",
1156 sb->utime, sb->state, sb->active_disks, sb->working_disks,
1157 sb->failed_disks, sb->spare_disks,
1158 sb->sb_csum, (unsigned long)sb->events_lo);
1159
1160 printk(KERN_INFO);
1161 for (i = 0; i < MD_SB_DISKS; i++) {
1162 mdp_disk_t *desc;
1163
1164 desc = sb->disks + i;
1165 if (desc->number || desc->major || desc->minor ||
1166 desc->raid_disk || (desc->state && (desc->state != 4))) {
1167 printk(" D %2d: ", i);
1168 print_desc(desc);
1169 }
1170 }
1171 printk(KERN_INFO "md: THIS: ");
1172 print_desc(&sb->this_disk);
1173
1174}
1175
1176static void print_rdev(mdk_rdev_t *rdev)
1177{
1178 char b[BDEVNAME_SIZE];
1179 printk(KERN_INFO "md: rdev %s, SZ:%08llu F:%d S:%d DN:%u\n",
1180 bdevname(rdev->bdev,b), (unsigned long long)rdev->size,
1181 rdev->faulty, rdev->in_sync, rdev->desc_nr);
1182 if (rdev->sb_loaded) {
1183 printk(KERN_INFO "md: rdev superblock:\n");
1184 print_sb((mdp_super_t*)page_address(rdev->sb_page));
1185 } else
1186 printk(KERN_INFO "md: no rdev superblock!\n");
1187}
1188
1189void md_print_devices(void)
1190{
1191 struct list_head *tmp, *tmp2;
1192 mdk_rdev_t *rdev;
1193 mddev_t *mddev;
1194 char b[BDEVNAME_SIZE];
1195
1196 printk("\n");
1197 printk("md: **********************************\n");
1198 printk("md: * <COMPLETE RAID STATE PRINTOUT> *\n");
1199 printk("md: **********************************\n");
1200 ITERATE_MDDEV(mddev,tmp) {
1201 printk("%s: ", mdname(mddev));
1202
1203 ITERATE_RDEV(mddev,rdev,tmp2)
1204 printk("<%s>", bdevname(rdev->bdev,b));
1205 printk("\n");
1206
1207 ITERATE_RDEV(mddev,rdev,tmp2)
1208 print_rdev(rdev);
1209 }
1210 printk("md: **********************************\n");
1211 printk("\n");
1212}
1213
1214
1215static int write_disk_sb(mdk_rdev_t * rdev)
1216{
1217 char b[BDEVNAME_SIZE];
1218 if (!rdev->sb_loaded) {
1219 MD_BUG();
1220 return 1;
1221 }
1222 if (rdev->faulty) {
1223 MD_BUG();
1224 return 1;
1225 }
1226
1227 dprintk(KERN_INFO "(write) %s's sb offset: %llu\n",
1228 bdevname(rdev->bdev,b),
1229 (unsigned long long)rdev->sb_offset);
1230
1231 if (sync_page_io(rdev->bdev, rdev->sb_offset<<1, MD_SB_BYTES, rdev->sb_page, WRITE))
1232 return 0;
1233
1234 printk("md: write_disk_sb failed for device %s\n",
1235 bdevname(rdev->bdev,b));
1236 return 1;
1237}
1238
1239static void sync_sbs(mddev_t * mddev)
1240{
1241 mdk_rdev_t *rdev;
1242 struct list_head *tmp;
1243
1244 ITERATE_RDEV(mddev,rdev,tmp) {
1245 super_types[mddev->major_version].
1246 sync_super(mddev, rdev);
1247 rdev->sb_loaded = 1;
1248 }
1249}
1250
1251static void md_update_sb(mddev_t * mddev)
1252{
1253 int err, count = 100;
1254 struct list_head *tmp;
1255 mdk_rdev_t *rdev;
 1256 int sync_req;
 1257
 1258repeat:
1259 spin_lock(&mddev->write_lock);
1260 sync_req = mddev->in_sync;
1261 mddev->utime = get_seconds();
1262 mddev->events ++;
1263
1264 if (!mddev->events) {
1265 /*
1266 * oops, this 64-bit counter should never wrap.
1267 * Either we are in around ~1 trillion A.C., assuming
1268 * 1 reboot per second, or we have a bug:
1269 */
1270 MD_BUG();
1271 mddev->events --;
1272 }
1273 sync_sbs(mddev);
1274
1275 /*
1276 * do not write anything to disk if using
1277 * nonpersistent superblocks
1278 */
1279 if (!mddev->persistent) {
1280 mddev->sb_dirty = 0;
1281 spin_unlock(&mddev->write_lock);
 1282 return;
1283 }
1284 spin_unlock(&mddev->write_lock);
1285
1286 dprintk(KERN_INFO
1287 "md: updating %s RAID superblock on device (in sync %d)\n",
1288 mdname(mddev),mddev->in_sync);
1289
1290 err = 0;
1291 ITERATE_RDEV(mddev,rdev,tmp) {
1292 char b[BDEVNAME_SIZE];
1293 dprintk(KERN_INFO "md: ");
1294 if (rdev->faulty)
1295 dprintk("(skipping faulty ");
1296
1297 dprintk("%s ", bdevname(rdev->bdev,b));
1298 if (!rdev->faulty) {
1299 err += write_disk_sb(rdev);
1300 } else
1301 dprintk(")\n");
1302 if (!err && mddev->level == LEVEL_MULTIPATH)
1303 /* only need to write one superblock... */
1304 break;
1305 }
1306 if (err) {
1307 if (--count) {
1308 printk(KERN_ERR "md: errors occurred during superblock"
1309 " update, repeating\n");
1310 goto repeat;
1311 }
1312 printk(KERN_ERR \
1313 "md: excessive errors occurred during superblock update, exiting\n");
1314 }
1315 spin_lock(&mddev->write_lock);
1316 if (mddev->in_sync != sync_req) {
1317 /* have to write it out again */
1318 spin_unlock(&mddev->write_lock);
1319 goto repeat;
1320 }
1321 mddev->sb_dirty = 0;
1322 spin_unlock(&mddev->write_lock);
1323
1324}
1325
1326/*
1327 * Import a device. If 'super_format' >= 0, then sanity check the superblock
1328 *
1329 * mark the device faulty if:
1330 *
1331 * - the device is nonexistent (zero size)
1332 * - the device has no valid superblock
1333 *
1334 * a faulty rdev _never_ has rdev->sb set.
1335 */
1336static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_minor)
1337{
1338 char b[BDEVNAME_SIZE];
1339 int err;
1340 mdk_rdev_t *rdev;
1341 sector_t size;
1342
1343 rdev = (mdk_rdev_t *) kmalloc(sizeof(*rdev), GFP_KERNEL);
1344 if (!rdev) {
1345 printk(KERN_ERR "md: could not alloc mem for new device!\n");
1346 return ERR_PTR(-ENOMEM);
1347 }
1348 memset(rdev, 0, sizeof(*rdev));
1349
1350 if ((err = alloc_disk_sb(rdev)))
1351 goto abort_free;
1352
1353 err = lock_rdev(rdev, newdev);
1354 if (err)
1355 goto abort_free;
1356
1357 rdev->desc_nr = -1;
1358 rdev->faulty = 0;
1359 rdev->in_sync = 0;
1360 rdev->data_offset = 0;
1361 atomic_set(&rdev->nr_pending, 0);
1362
1363 size = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
1364 if (!size) {
1365 printk(KERN_WARNING
1366 "md: %s has zero or unknown size, marking faulty!\n",
1367 bdevname(rdev->bdev,b));
1368 err = -EINVAL;
1369 goto abort_free;
1370 }
1371
1372 if (super_format >= 0) {
1373 err = super_types[super_format].
1374 load_super(rdev, NULL, super_minor);
1375 if (err == -EINVAL) {
1376 printk(KERN_WARNING
1377 "md: %s has invalid sb, not importing!\n",
1378 bdevname(rdev->bdev,b));
1379 goto abort_free;
1380 }
1381 if (err < 0) {
1382 printk(KERN_WARNING
1383 "md: could not read %s's sb, not importing!\n",
1384 bdevname(rdev->bdev,b));
1385 goto abort_free;
1386 }
1387 }
1388 INIT_LIST_HEAD(&rdev->same_set);
1389
1390 return rdev;
1391
1392abort_free:
1393 if (rdev->sb_page) {
1394 if (rdev->bdev)
1395 unlock_rdev(rdev);
1396 free_disk_sb(rdev);
1397 }
1398 kfree(rdev);
1399 return ERR_PTR(err);
1400}
1401
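/*
 * Import-and-bind sketch (illustrative only): a hypothetical helper with
 * the same shape as the ADD_NEW_DISK and autostart paths below - import
 * the device, then bind it to the array, exporting it again on failure.
 */
static int import_and_bind_example(mddev_t *mddev, dev_t dev)
{
	mdk_rdev_t *rdev = md_import_device(dev, mddev->major_version,
					    mddev->minor_version);
	int err;

	if (IS_ERR(rdev))
		return PTR_ERR(rdev);
	err = bind_rdev_to_array(rdev, mddev);
	if (err)
		export_rdev(rdev);
	return err;
}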
1402/*
1403 * Check a full RAID array for plausibility
1404 */
1405
1406
 1407static void analyze_sbs(mddev_t * mddev)
1408{
1409 int i;
1410 struct list_head *tmp;
1411 mdk_rdev_t *rdev, *freshest;
1412 char b[BDEVNAME_SIZE];
1413
1414 freshest = NULL;
1415 ITERATE_RDEV(mddev,rdev,tmp)
1416 switch (super_types[mddev->major_version].
1417 load_super(rdev, freshest, mddev->minor_version)) {
1418 case 1:
1419 freshest = rdev;
1420 break;
1421 case 0:
1422 break;
1423 default:
1424 printk( KERN_ERR \
1425 "md: fatal superblock inconsistency in %s"
1426 " -- removing from array\n",
1427 bdevname(rdev->bdev,b));
1428 kick_rdev_from_array(rdev);
1429 }
1430
1431
1432 super_types[mddev->major_version].
1433 validate_super(mddev, freshest);
1434
1435 i = 0;
1436 ITERATE_RDEV(mddev,rdev,tmp) {
1437 if (rdev != freshest)
1438 if (super_types[mddev->major_version].
1439 validate_super(mddev, rdev)) {
1440 printk(KERN_WARNING "md: kicking non-fresh %s"
1441 " from array!\n",
1442 bdevname(rdev->bdev,b));
1443 kick_rdev_from_array(rdev);
1444 continue;
1445 }
1446 if (mddev->level == LEVEL_MULTIPATH) {
1447 rdev->desc_nr = i++;
1448 rdev->raid_disk = rdev->desc_nr;
1449 rdev->in_sync = 1;
1450 }
1451 }
1452
1453
1454
1455 if (mddev->recovery_cp != MaxSector &&
1456 mddev->level >= 1)
1457 printk(KERN_ERR "md: %s: raid array is not clean"
1458 " -- starting background reconstruction\n",
1459 mdname(mddev));
1460
1461}
1462
1463int mdp_major = 0;
1464
1465static struct kobject *md_probe(dev_t dev, int *part, void *data)
1466{
1467 static DECLARE_MUTEX(disks_sem);
1468 mddev_t *mddev = mddev_find(dev);
1469 struct gendisk *disk;
1470 int partitioned = (MAJOR(dev) != MD_MAJOR);
1471 int shift = partitioned ? MdpMinorShift : 0;
1472 int unit = MINOR(dev) >> shift;
1473
1474 if (!mddev)
1475 return NULL;
1476
1477 down(&disks_sem);
1478 if (mddev->gendisk) {
1479 up(&disks_sem);
1480 mddev_put(mddev);
1481 return NULL;
1482 }
1483 disk = alloc_disk(1 << shift);
1484 if (!disk) {
1485 up(&disks_sem);
1486 mddev_put(mddev);
1487 return NULL;
1488 }
1489 disk->major = MAJOR(dev);
1490 disk->first_minor = unit << shift;
1491 if (partitioned) {
1492 sprintf(disk->disk_name, "md_d%d", unit);
1493 sprintf(disk->devfs_name, "md/d%d", unit);
1494 } else {
1495 sprintf(disk->disk_name, "md%d", unit);
1496 sprintf(disk->devfs_name, "md/%d", unit);
1497 }
1498 disk->fops = &md_fops;
1499 disk->private_data = mddev;
1500 disk->queue = mddev->queue;
1501 add_disk(disk);
1502 mddev->gendisk = disk;
1503 up(&disks_sem);
1504 return NULL;
1505}
1506
1507void md_wakeup_thread(mdk_thread_t *thread);
1508
1509static void md_safemode_timeout(unsigned long data)
1510{
1511 mddev_t *mddev = (mddev_t *) data;
1512
1513 mddev->safemode = 1;
1514 md_wakeup_thread(mddev->thread);
1515}
1516
1517
1518static int do_md_run(mddev_t * mddev)
1519{
1520 int pnum, err;
1521 int chunk_size;
1522 struct list_head *tmp;
1523 mdk_rdev_t *rdev;
1524 struct gendisk *disk;
1525 char b[BDEVNAME_SIZE];
1526
1527 if (list_empty(&mddev->disks))
1528 /* cannot run an array with no devices.. */
 1529 return -EINVAL;
1530
1531 if (mddev->pers)
1532 return -EBUSY;
1533
1534 /*
1535 * Analyze all RAID superblock(s)
1536 */
1537 if (!mddev->raid_disks)
1538 analyze_sbs(mddev);
1539
1540 chunk_size = mddev->chunk_size;
1541 pnum = level_to_pers(mddev->level);
1542
1543 if ((pnum != MULTIPATH) && (pnum != RAID1)) {
1544 if (!chunk_size) {
1545 /*
1546 * 'default chunksize' in the old md code used to
1547 * be PAGE_SIZE, baaad.
1548 * we abort here to be on the safe side. We don't
1549 * want to continue the bad practice.
1550 */
1551 printk(KERN_ERR
1552 "no chunksize specified, see 'man raidtab'\n");
1553 return -EINVAL;
1554 }
1555 if (chunk_size > MAX_CHUNK_SIZE) {
1556 printk(KERN_ERR "too big chunk_size: %d > %d\n",
1557 chunk_size, MAX_CHUNK_SIZE);
1558 return -EINVAL;
1559 }
1560 /*
1561 * chunk-size has to be a power of 2 and multiples of PAGE_SIZE
1562 */
1563 if ( (1 << ffz(~chunk_size)) != chunk_size) {
 1564 printk(KERN_ERR "chunk_size of %d not valid\n", chunk_size);
1565 return -EINVAL;
1566 }
1567 if (chunk_size < PAGE_SIZE) {
1568 printk(KERN_ERR "too small chunk_size: %d < %ld\n",
1569 chunk_size, PAGE_SIZE);
1570 return -EINVAL;
1571 }
1572
1573 /* devices must have minimum size of one chunk */
1574 ITERATE_RDEV(mddev,rdev,tmp) {
1575 if (rdev->faulty)
1576 continue;
1577 if (rdev->size < chunk_size / 1024) {
1578 printk(KERN_WARNING
1579 "md: Dev %s smaller than chunk_size:"
1580 " %lluk < %dk\n",
1581 bdevname(rdev->bdev,b),
1582 (unsigned long long)rdev->size,
1583 chunk_size / 1024);
1584 return -EINVAL;
1585 }
1586 }
1587 }
1588
1589#ifdef CONFIG_KMOD
1590 if (!pers[pnum])
1591 {
1592 request_module("md-personality-%d", pnum);
1593 }
1594#endif
1595
1596 /*
1597 * Drop all container device buffers, from now on
1598 * the only valid external interface is through the md
1599 * device.
1600 * Also find largest hardsector size
1601 */
1602 ITERATE_RDEV(mddev,rdev,tmp) {
1603 if (rdev->faulty)
1604 continue;
1605 sync_blockdev(rdev->bdev);
1606 invalidate_bdev(rdev->bdev, 0);
1607 }
1608
1609 md_probe(mddev->unit, NULL, NULL);
1610 disk = mddev->gendisk;
1611 if (!disk)
1612 return -ENOMEM;
1613
1614 spin_lock(&pers_lock);
1615 if (!pers[pnum] || !try_module_get(pers[pnum]->owner)) {
1616 spin_unlock(&pers_lock);
1617 printk(KERN_WARNING "md: personality %d is not loaded!\n",
1618 pnum);
1619 return -EINVAL;
1620 }
1621
1622 mddev->pers = pers[pnum];
1623 spin_unlock(&pers_lock);
1624
1625 mddev->resync_max_sectors = mddev->size << 1; /* may be over-ridden by personality */
1626
1627 err = mddev->pers->run(mddev);
1628 if (err) {
1629 printk(KERN_ERR "md: pers->run() failed ...\n");
1630 module_put(mddev->pers->owner);
1631 mddev->pers = NULL;
1632 return -EINVAL;
1633 }
1634 atomic_set(&mddev->writes_pending,0);
1635 mddev->safemode = 0;
1636 mddev->safemode_timer.function = md_safemode_timeout;
1637 mddev->safemode_timer.data = (unsigned long) mddev;
1638 mddev->safemode_delay = (20 * HZ)/1000 +1; /* 20 msec delay */
1639 mddev->in_sync = 1;
1640
1641 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
1642
1643 if (mddev->sb_dirty)
1644 md_update_sb(mddev);
1645
1646 set_capacity(disk, mddev->array_size<<1);
1647
1648 /* If we call blk_queue_make_request here, it will
1649 * re-initialise max_sectors etc which may have been
1650 * refined inside -> run. So just set the bits we need to set.
 1651 * Most initialisation happened when we called
1652 * blk_queue_make_request(..., md_fail_request)
1653 * earlier.
1654 */
1655 mddev->queue->queuedata = mddev;
1656 mddev->queue->make_request_fn = mddev->pers->make_request;
1657
1658 mddev->changed = 1;
1659 return 0;
1660}
1661
1662static int restart_array(mddev_t *mddev)
1663{
1664 struct gendisk *disk = mddev->gendisk;
1665 int err;
1666
1667 /*
1668 * Complain if it has no devices
1669 */
1670 err = -ENXIO;
1671 if (list_empty(&mddev->disks))
1672 goto out;
1673
1674 if (mddev->pers) {
1675 err = -EBUSY;
1676 if (!mddev->ro)
1677 goto out;
1678
1679 mddev->safemode = 0;
1680 mddev->ro = 0;
1681 set_disk_ro(disk, 0);
1682
1683 printk(KERN_INFO "md: %s switched to read-write mode.\n",
1684 mdname(mddev));
1685 /*
1686 * Kick recovery or resync if necessary
1687 */
1688 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
1689 md_wakeup_thread(mddev->thread);
1690 err = 0;
1691 } else {
1692 printk(KERN_ERR "md: %s has no personality assigned.\n",
1693 mdname(mddev));
1694 err = -EINVAL;
1695 }
1696
1697out:
1698 return err;
1699}
1700
1701static int do_md_stop(mddev_t * mddev, int ro)
1702{
1703 int err = 0;
1704 struct gendisk *disk = mddev->gendisk;
1705
1706 if (mddev->pers) {
1707 if (atomic_read(&mddev->active)>2) {
1708 printk("md: %s still in use.\n",mdname(mddev));
1709 return -EBUSY;
1710 }
1711
1712 if (mddev->sync_thread) {
1713 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
1714 md_unregister_thread(mddev->sync_thread);
1715 mddev->sync_thread = NULL;
1716 }
1717
1718 del_timer_sync(&mddev->safemode_timer);
1719
1720 invalidate_partition(disk, 0);
1721
1722 if (ro) {
1723 err = -ENXIO;
1724 if (mddev->ro)
1725 goto out;
1726 mddev->ro = 1;
1727 } else {
1728 if (mddev->ro)
1729 set_disk_ro(disk, 0);
1730 blk_queue_make_request(mddev->queue, md_fail_request);
1731 mddev->pers->stop(mddev);
1732 module_put(mddev->pers->owner);
1733 mddev->pers = NULL;
1734 if (mddev->ro)
1735 mddev->ro = 0;
1736 }
1737 if (!mddev->in_sync) {
1738 /* mark array as shutdown cleanly */
1739 mddev->in_sync = 1;
1740 md_update_sb(mddev);
1741 }
1742 if (ro)
1743 set_disk_ro(disk, 1);
1744 }
1745 /*
1746 * Free resources if final stop
1747 */
1748 if (!ro) {
1749 struct gendisk *disk;
1750 printk(KERN_INFO "md: %s stopped.\n", mdname(mddev));
1751
1752 export_array(mddev);
1753
1754 mddev->array_size = 0;
1755 disk = mddev->gendisk;
1756 if (disk)
1757 set_capacity(disk, 0);
1758 mddev->changed = 1;
1759 } else
1760 printk(KERN_INFO "md: %s switched to read-only mode.\n",
1761 mdname(mddev));
1762 err = 0;
1763out:
1764 return err;
1765}
1766
1767static void autorun_array(mddev_t *mddev)
1768{
1769 mdk_rdev_t *rdev;
1770 struct list_head *tmp;
1771 int err;
1772
 1773 if (list_empty(&mddev->disks))
 1774 return;
1775
1776 printk(KERN_INFO "md: running: ");
1777
1778 ITERATE_RDEV(mddev,rdev,tmp) {
1779 char b[BDEVNAME_SIZE];
1780 printk("<%s>", bdevname(rdev->bdev,b));
1781 }
1782 printk("\n");
1783
1784 err = do_md_run (mddev);
1785 if (err) {
1786 printk(KERN_WARNING "md: do_md_run() returned %d\n", err);
1787 do_md_stop (mddev, 0);
1788 }
1789}
1790
1791/*
 1792 * let's try to run arrays based on all disks that have arrived
1793 * until now. (those are in pending_raid_disks)
1794 *
1795 * the method: pick the first pending disk, collect all disks with
1796 * the same UUID, remove all from the pending list and put them into
1797 * the 'same_array' list. Then order this list based on superblock
1798 * update time (freshest comes first), kick out 'old' disks and
1799 * compare superblocks. If everything's fine then run it.
1800 *
1801 * If "unit" is allocated, then bump its reference count
1802 */
1803static void autorun_devices(int part)
1804{
1805 struct list_head candidates;
1806 struct list_head *tmp;
1807 mdk_rdev_t *rdev0, *rdev;
1808 mddev_t *mddev;
1809 char b[BDEVNAME_SIZE];
1810
1811 printk(KERN_INFO "md: autorun ...\n");
1812 while (!list_empty(&pending_raid_disks)) {
1813 dev_t dev;
1814 rdev0 = list_entry(pending_raid_disks.next,
1815 mdk_rdev_t, same_set);
1816
1817 printk(KERN_INFO "md: considering %s ...\n",
1818 bdevname(rdev0->bdev,b));
1819 INIT_LIST_HEAD(&candidates);
1820 ITERATE_RDEV_PENDING(rdev,tmp)
1821 if (super_90_load(rdev, rdev0, 0) >= 0) {
1822 printk(KERN_INFO "md: adding %s ...\n",
1823 bdevname(rdev->bdev,b));
1824 list_move(&rdev->same_set, &candidates);
1825 }
1826 /*
1827 * now we have a set of devices, with all of them having
1828 * mostly sane superblocks. It's time to allocate the
1829 * mddev.
1830 */
1831 if (rdev0->preferred_minor < 0 || rdev0->preferred_minor >= MAX_MD_DEVS) {
1832 printk(KERN_INFO "md: unit number in %s is bad: %d\n",
1833 bdevname(rdev0->bdev, b), rdev0->preferred_minor);
1834 break;
1835 }
1836 if (part)
1837 dev = MKDEV(mdp_major,
1838 rdev0->preferred_minor << MdpMinorShift);
1839 else
1840 dev = MKDEV(MD_MAJOR, rdev0->preferred_minor);
1841
1842 md_probe(dev, NULL, NULL);
1843 mddev = mddev_find(dev);
1844 if (!mddev) {
1845 printk(KERN_ERR
1846 "md: cannot allocate memory for md drive.\n");
1847 break;
1848 }
1849 if (mddev_lock(mddev))
1850 printk(KERN_WARNING "md: %s locked, cannot run\n",
1851 mdname(mddev));
1852 else if (mddev->raid_disks || mddev->major_version
1853 || !list_empty(&mddev->disks)) {
1854 printk(KERN_WARNING
1855 "md: %s already running, cannot run %s\n",
1856 mdname(mddev), bdevname(rdev0->bdev,b));
1857 mddev_unlock(mddev);
1858 } else {
1859 printk(KERN_INFO "md: created %s\n", mdname(mddev));
1860 ITERATE_RDEV_GENERIC(candidates,rdev,tmp) {
1861 list_del_init(&rdev->same_set);
1862 if (bind_rdev_to_array(rdev, mddev))
1863 export_rdev(rdev);
1864 }
1865 autorun_array(mddev);
1866 mddev_unlock(mddev);
1867 }
1868 /* on success, candidates will be empty, on error
1869 * it won't...
1870 */
1871 ITERATE_RDEV_GENERIC(candidates,rdev,tmp)
1872 export_rdev(rdev);
1873 mddev_put(mddev);
1874 }
1875 printk(KERN_INFO "md: ... autorun DONE.\n");
1876}
1877
1878/*
1879 * import RAID devices based on one partition
1880 * if possible, the array gets run as well.
1881 */
1882
1883static int autostart_array(dev_t startdev)
1884{
1885 char b[BDEVNAME_SIZE];
1886 int err = -EINVAL, i;
1887 mdp_super_t *sb = NULL;
1888 mdk_rdev_t *start_rdev = NULL, *rdev;
1889
1890 start_rdev = md_import_device(startdev, 0, 0);
1891 if (IS_ERR(start_rdev))
1892 return err;
1893
1894
1895 /* NOTE: this can only work for 0.90.0 superblocks */
1896 sb = (mdp_super_t*)page_address(start_rdev->sb_page);
1897 if (sb->major_version != 0 ||
1898 sb->minor_version != 90 ) {
1899 printk(KERN_WARNING "md: can only autostart 0.90.0 arrays\n");
1900 export_rdev(start_rdev);
1901 return err;
1902 }
1903
1904 if (start_rdev->faulty) {
1905 printk(KERN_WARNING
1906 "md: can not autostart based on faulty %s!\n",
1907 bdevname(start_rdev->bdev,b));
1908 export_rdev(start_rdev);
1909 return err;
1910 }
1911 list_add(&start_rdev->same_set, &pending_raid_disks);
1912
1913 for (i = 0; i < MD_SB_DISKS; i++) {
1914 mdp_disk_t *desc = sb->disks + i;
1915 dev_t dev = MKDEV(desc->major, desc->minor);
1916
1917 if (!dev)
1918 continue;
1919 if (dev == startdev)
1920 continue;
1921 if (MAJOR(dev) != desc->major || MINOR(dev) != desc->minor)
1922 continue;
1923 rdev = md_import_device(dev, 0, 0);
1924 if (IS_ERR(rdev))
1925 continue;
1926
1927 list_add(&rdev->same_set, &pending_raid_disks);
1928 }
1929
1930 /*
1931 * possibly return codes
1932 */
1933 autorun_devices(0);
1934 return 0;
1935
1936}
1937
1938
1939static int get_version(void __user * arg)
1940{
1941 mdu_version_t ver;
1942
1943 ver.major = MD_MAJOR_VERSION;
1944 ver.minor = MD_MINOR_VERSION;
1945 ver.patchlevel = MD_PATCHLEVEL_VERSION;
1946
1947 if (copy_to_user(arg, &ver, sizeof(ver)))
1948 return -EFAULT;
1949
1950 return 0;
1951}
1952
1953static int get_array_info(mddev_t * mddev, void __user * arg)
1954{
1955 mdu_array_info_t info;
1956 int nr,working,active,failed,spare;
1957 mdk_rdev_t *rdev;
1958 struct list_head *tmp;
1959
1960 nr=working=active=failed=spare=0;
1961 ITERATE_RDEV(mddev,rdev,tmp) {
1962 nr++;
1963 if (rdev->faulty)
1964 failed++;
1965 else {
1966 working++;
1967 if (rdev->in_sync)
1968 active++;
1969 else
1970 spare++;
1971 }
1972 }
1973
1974 info.major_version = mddev->major_version;
1975 info.minor_version = mddev->minor_version;
1976 info.patch_version = MD_PATCHLEVEL_VERSION;
1977 info.ctime = mddev->ctime;
1978 info.level = mddev->level;
1979 info.size = mddev->size;
1980 info.nr_disks = nr;
1981 info.raid_disks = mddev->raid_disks;
1982 info.md_minor = mddev->md_minor;
1983 info.not_persistent= !mddev->persistent;
1984
1985 info.utime = mddev->utime;
1986 info.state = 0;
1987 if (mddev->in_sync)
1988 info.state = (1<<MD_SB_CLEAN);
1989 info.active_disks = active;
1990 info.working_disks = working;
1991 info.failed_disks = failed;
1992 info.spare_disks = spare;
1993
1994 info.layout = mddev->layout;
1995 info.chunk_size = mddev->chunk_size;
1996
1997 if (copy_to_user(arg, &info, sizeof(info)))
1998 return -EFAULT;
1999
2000 return 0;
2001}
2002
2003static int get_disk_info(mddev_t * mddev, void __user * arg)
2004{
2005 mdu_disk_info_t info;
2006 unsigned int nr;
2007 mdk_rdev_t *rdev;
2008
2009 if (copy_from_user(&info, arg, sizeof(info)))
2010 return -EFAULT;
2011
2012 nr = info.number;
2013
2014 rdev = find_rdev_nr(mddev, nr);
2015 if (rdev) {
2016 info.major = MAJOR(rdev->bdev->bd_dev);
2017 info.minor = MINOR(rdev->bdev->bd_dev);
2018 info.raid_disk = rdev->raid_disk;
2019 info.state = 0;
2020 if (rdev->faulty)
2021 info.state |= (1<<MD_DISK_FAULTY);
2022 else if (rdev->in_sync) {
2023 info.state |= (1<<MD_DISK_ACTIVE);
2024 info.state |= (1<<MD_DISK_SYNC);
2025 }
2026 } else {
2027 info.major = info.minor = 0;
2028 info.raid_disk = -1;
2029 info.state = (1<<MD_DISK_REMOVED);
2030 }
2031
2032 if (copy_to_user(arg, &info, sizeof(info)))
2033 return -EFAULT;
2034
2035 return 0;
2036}
2037
2038static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
2039{
2040 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
2041 mdk_rdev_t *rdev;
2042 dev_t dev = MKDEV(info->major,info->minor);
2043
2044 if (info->major != MAJOR(dev) || info->minor != MINOR(dev))
2045 return -EOVERFLOW;
2046
2047 if (!mddev->raid_disks) {
2048 int err;
2049 /* expecting a device which has a superblock */
2050 rdev = md_import_device(dev, mddev->major_version, mddev->minor_version);
2051 if (IS_ERR(rdev)) {
2052 printk(KERN_WARNING
2053 "md: md_import_device returned %ld\n",
2054 PTR_ERR(rdev));
2055 return PTR_ERR(rdev);
2056 }
2057 if (!list_empty(&mddev->disks)) {
2058 mdk_rdev_t *rdev0 = list_entry(mddev->disks.next,
2059 mdk_rdev_t, same_set);
2060 int err = super_types[mddev->major_version]
2061 .load_super(rdev, rdev0, mddev->minor_version);
2062 if (err < 0) {
2063 printk(KERN_WARNING
2064 "md: %s has different UUID to %s\n",
2065 bdevname(rdev->bdev,b),
2066 bdevname(rdev0->bdev,b2));
2067 export_rdev(rdev);
2068 return -EINVAL;
2069 }
2070 }
2071 err = bind_rdev_to_array(rdev, mddev);
2072 if (err)
2073 export_rdev(rdev);
2074 return err;
2075 }
2076
2077 /*
2078 * add_new_disk can be used once the array is assembled
2079 * to add "hot spares". They must already have a superblock
2080 * written
2081 */
2082 if (mddev->pers) {
2083 int err;
2084 if (!mddev->pers->hot_add_disk) {
2085 printk(KERN_WARNING
2086 "%s: personality does not support diskops!\n",
2087 mdname(mddev));
2088 return -EINVAL;
2089 }
2090 rdev = md_import_device(dev, mddev->major_version,
2091 mddev->minor_version);
2092 if (IS_ERR(rdev)) {
2093 printk(KERN_WARNING
2094 "md: md_import_device returned %ld\n",
2095 PTR_ERR(rdev));
2096 return PTR_ERR(rdev);
2097 }
2098 rdev->in_sync = 0; /* just to be sure */
2099 rdev->raid_disk = -1;
2100 err = bind_rdev_to_array(rdev, mddev);
2101 if (err)
2102 export_rdev(rdev);
2103
2104 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
2105 if (mddev->thread)
2106 md_wakeup_thread(mddev->thread);
2107 return err;
2108 }
2109
2110 /* otherwise, add_new_disk is only allowed
2111 * for major_version==0 superblocks
2112 */
2113 if (mddev->major_version != 0) {
2114 printk(KERN_WARNING "%s: ADD_NEW_DISK not supported\n",
2115 mdname(mddev));
2116 return -EINVAL;
2117 }
2118
2119 if (!(info->state & (1<<MD_DISK_FAULTY))) {
2120 int err;
2121 rdev = md_import_device (dev, -1, 0);
2122 if (IS_ERR(rdev)) {
2123 printk(KERN_WARNING
2124 "md: error, md_import_device() returned %ld\n",
2125 PTR_ERR(rdev));
2126 return PTR_ERR(rdev);
2127 }
2128 rdev->desc_nr = info->number;
2129 if (info->raid_disk < mddev->raid_disks)
2130 rdev->raid_disk = info->raid_disk;
2131 else
2132 rdev->raid_disk = -1;
2133
2134 rdev->faulty = 0;
2135 if (rdev->raid_disk < mddev->raid_disks)
2136 rdev->in_sync = (info->state & (1<<MD_DISK_SYNC));
2137 else
2138 rdev->in_sync = 0;
2139
2140 err = bind_rdev_to_array(rdev, mddev);
2141 if (err) {
2142 export_rdev(rdev);
2143 return err;
2144 }
2145
2146 if (!mddev->persistent) {
2147 printk(KERN_INFO "md: nonpersistent superblock ...\n");
2148 rdev->sb_offset = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
2149 } else
2150 rdev->sb_offset = calc_dev_sboffset(rdev->bdev);
2151 rdev->size = calc_dev_size(rdev, mddev->chunk_size);
2152
2153 if (!mddev->size || (mddev->size > rdev->size))
2154 mddev->size = rdev->size;
2155 }
2156
2157 return 0;
2158}
2159
2160static int hot_remove_disk(mddev_t * mddev, dev_t dev)
2161{
2162 char b[BDEVNAME_SIZE];
2163 mdk_rdev_t *rdev;
2164
2165 if (!mddev->pers)
2166 return -ENODEV;
2167
2168 rdev = find_rdev(mddev, dev);
2169 if (!rdev)
2170 return -ENXIO;
2171
2172 if (rdev->raid_disk >= 0)
2173 goto busy;
2174
2175 kick_rdev_from_array(rdev);
2176 md_update_sb(mddev);
2177
2178 return 0;
2179busy:
2180 printk(KERN_WARNING "md: cannot remove active disk %s from %s ... \n",
2181 bdevname(rdev->bdev,b), mdname(mddev));
2182 return -EBUSY;
2183}
2184
2185static int hot_add_disk(mddev_t * mddev, dev_t dev)
2186{
2187 char b[BDEVNAME_SIZE];
2188 int err;
2189 unsigned int size;
2190 mdk_rdev_t *rdev;
2191
2192 if (!mddev->pers)
2193 return -ENODEV;
2194
2195 if (mddev->major_version != 0) {
2196 printk(KERN_WARNING "%s: HOT_ADD may only be used with"
2197 " version-0 superblocks.\n",
2198 mdname(mddev));
2199 return -EINVAL;
2200 }
2201 if (!mddev->pers->hot_add_disk) {
2202 printk(KERN_WARNING
2203 "%s: personality does not support diskops!\n",
2204 mdname(mddev));
2205 return -EINVAL;
2206 }
2207
2208 rdev = md_import_device (dev, -1, 0);
2209 if (IS_ERR(rdev)) {
2210 printk(KERN_WARNING
2211 "md: error, md_import_device() returned %ld\n",
2212 PTR_ERR(rdev));
2213 return -EINVAL;
2214 }
2215
2216 if (mddev->persistent)
2217 rdev->sb_offset = calc_dev_sboffset(rdev->bdev);
2218 else
2219 rdev->sb_offset =
2220 rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
2221
2222 size = calc_dev_size(rdev, mddev->chunk_size);
2223 rdev->size = size;
2224
2225 if (size < mddev->size) {
2226 printk(KERN_WARNING
2227 "%s: disk size %llu blocks < array size %llu\n",
2228 mdname(mddev), (unsigned long long)size,
2229 (unsigned long long)mddev->size);
2230 err = -ENOSPC;
2231 goto abort_export;
2232 }
2233
2234 if (rdev->faulty) {
2235 printk(KERN_WARNING
2236 "md: can not hot-add faulty %s disk to %s!\n",
2237 bdevname(rdev->bdev,b), mdname(mddev));
2238 err = -EINVAL;
2239 goto abort_export;
2240 }
2241 rdev->in_sync = 0;
2242 rdev->desc_nr = -1;
2243 bind_rdev_to_array(rdev, mddev);
2244
2245 /*
2246 * The rest had better be atomic; disk failures can be
2247 * noticed in interrupt context ...
2248 */
2249
2250 if (rdev->desc_nr == mddev->max_disks) {
2251 printk(KERN_WARNING "%s: can not hot-add to full array!\n",
2252 mdname(mddev));
2253 err = -EBUSY;
2254 goto abort_unbind_export;
2255 }
2256
2257 rdev->raid_disk = -1;
2258
2259 md_update_sb(mddev);
2260
2261 /*
2262 * Kick recovery, maybe this spare has to be added to the
2263 * array immediately.
2264 */
2265 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
2266 md_wakeup_thread(mddev->thread);
2267
2268 return 0;
2269
2270abort_unbind_export:
2271 unbind_rdev_from_array(rdev);
2272
2273abort_export:
2274 export_rdev(rdev);
2275 return err;
2276}
2277
2278/*
2279 * set_array_info is used two different ways
2280 * The original usage is when creating a new array.
2281 * In this usage, raid_disks is > 0 and it together with
2282 * level, size, not_persistent, layout, chunksize determine the
2283 * shape of the array.
2284 * This will always create an array with a type-0.90.0 superblock.
2285 * The newer usage is when assembling an array.
2286 * In this case raid_disks will be 0, and the major_version field is
2287 * used to determine which style super-blocks are to be found on the devices.
2288 * The minor and patch _version numbers are also kept in case the
2289 * super_block handler wishes to interpret them.
2290 */
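/*
 * Illustrative sketch only (not part of the driver; field values are
 * assumptions): the two usages above correspond to how a userspace tool
 * might fill mdu_array_info_t before issuing the SET_ARRAY_INFO ioctl.
 *
 *	mdu_array_info_t info = { 0 };
 *
 *	info.raid_disks = 2;		creation: shape of the new array
 *	info.level = 1;
 *	ioctl(fd, SET_ARRAY_INFO, &info);
 *
 *	memset(&info, 0, sizeof(info));	assembly: raid_disks stays 0,
 *	info.major_version = 0;		only the version fields matter
 *	info.minor_version = 90;
 *	ioctl(fd, SET_ARRAY_INFO, &info);
 */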
2291static int set_array_info(mddev_t * mddev, mdu_array_info_t *info)
2292{
2293
2294 if (info->raid_disks == 0) {
2295 /* just setting version number for superblock loading */
2296 if (info->major_version < 0 ||
2297 info->major_version >= sizeof(super_types)/sizeof(super_types[0]) ||
2298 super_types[info->major_version].name == NULL) {
2299 /* maybe try to auto-load a module? */
2300 printk(KERN_INFO
2301 "md: superblock version %d not known\n",
2302 info->major_version);
2303 return -EINVAL;
2304 }
2305 mddev->major_version = info->major_version;
2306 mddev->minor_version = info->minor_version;
2307 mddev->patch_version = info->patch_version;
2308 return 0;
2309 }
2310 mddev->major_version = MD_MAJOR_VERSION;
2311 mddev->minor_version = MD_MINOR_VERSION;
2312 mddev->patch_version = MD_PATCHLEVEL_VERSION;
2313 mddev->ctime = get_seconds();
2314
2315 mddev->level = info->level;
2316 mddev->size = info->size;
2317 mddev->raid_disks = info->raid_disks;
2318 /* don't set md_minor, it is determined by which /dev/md* was
2319 * opened
2320 */
2321 if (info->state & (1<<MD_SB_CLEAN))
2322 mddev->recovery_cp = MaxSector;
2323 else
2324 mddev->recovery_cp = 0;
2325 mddev->persistent = ! info->not_persistent;
2326
2327 mddev->layout = info->layout;
2328 mddev->chunk_size = info->chunk_size;
2329
2330 mddev->max_disks = MD_SB_DISKS;
2331
2332 mddev->sb_dirty = 1;
2333
2334 /*
2335 * Generate a 128 bit UUID
2336 */
2337 get_random_bytes(mddev->uuid, 16);
2338
2339 return 0;
2340}
2341
2342/*
2343 * update_array_info is used to change the configuration of an
2344 * on-line array.
2345 * The version, ctime, level, size, raid_disks, not_persistent, layout, chunk_size
2346 * fields in the info are checked against the array.
2347 * Any differences that cannot be handled will cause an error.
2348 * Normally, only one change can be managed at a time.
2349 */
2350static int update_array_info(mddev_t *mddev, mdu_array_info_t *info)
2351{
2352 int rv = 0;
2353 int cnt = 0;
2354
2355 if (mddev->major_version != info->major_version ||
2356 mddev->minor_version != info->minor_version ||
2357/* mddev->patch_version != info->patch_version || */
2358 mddev->ctime != info->ctime ||
2359 mddev->level != info->level ||
2360/* mddev->layout != info->layout || */
2361 !mddev->persistent != info->not_persistent||
2362 mddev->chunk_size != info->chunk_size )
2363 return -EINVAL;
2364 /* Check there is only one change */
2365 if (mddev->size != info->size) cnt++;
2366 if (mddev->raid_disks != info->raid_disks) cnt++;
2367 if (mddev->layout != info->layout) cnt++;
2368 if (cnt == 0) return 0;
2369 if (cnt > 1) return -EINVAL;
2370
2371 if (mddev->layout != info->layout) {
2372 /* Change layout
2373 * we don't need to do anything at the md level, the
2374 * personality will take care of it all.
2375 */
2376 if (mddev->pers->reconfig == NULL)
2377 return -EINVAL;
2378 else
2379 return mddev->pers->reconfig(mddev, info->layout, -1);
2380 }
2381 if (mddev->size != info->size) {
2382 mdk_rdev_t * rdev;
2383 struct list_head *tmp;
2384 if (mddev->pers->resize == NULL)
2385 return -EINVAL;
2386 /* The "size" is the amount of each device that is used.
2387 * This can only make sense for arrays with redundancy.
2388 * linear and raid0 always use whatever space is available
2389 * We can only consider changing the size if no resync
2390 * or reconstruction is happening, and if the new size
2391 * is acceptable. It must fit before the sb_offset or,
2392 * if that is <data_offset, it must fit before the
2393 * size of each device.
2394 * If size is zero, we find the largest size that fits.
2395 */
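		/*
		 * Illustrative numbers (an assumption): for a member whose
		 * superblock sits at sb_offset = 100000 (KB) with
		 * data_offset = 0, avail is 100000*2 = 200000 sectors, so a
		 * requested info->size above 100000 KB fails with -ENOSPC,
		 * while info->size == 0 picks 100000 KB automatically.
		 */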
2396 if (mddev->sync_thread)
2397 return -EBUSY;
2398 ITERATE_RDEV(mddev,rdev,tmp) {
2399 sector_t avail;
2400 int fit = (info->size == 0);
2401 if (rdev->sb_offset > rdev->data_offset)
2402 avail = (rdev->sb_offset*2) - rdev->data_offset;
2403 else
2404 avail = get_capacity(rdev->bdev->bd_disk)
2405 - rdev->data_offset;
2406 if (fit && (info->size == 0 || info->size > avail/2))
2407 info->size = avail/2;
2408 if (avail < ((sector_t)info->size << 1))
2409 return -ENOSPC;
2410 }
2411 rv = mddev->pers->resize(mddev, (sector_t)info->size *2);
2412 if (!rv) {
2413 struct block_device *bdev;
2414
2415 bdev = bdget_disk(mddev->gendisk, 0);
2416 if (bdev) {
2417 down(&bdev->bd_inode->i_sem);
2418 i_size_write(bdev->bd_inode, mddev->array_size << 10);
2419 up(&bdev->bd_inode->i_sem);
2420 bdput(bdev);
2421 }
2422 }
2423 }
2424 if (mddev->raid_disks != info->raid_disks) {
2425 /* change the number of raid disks */
2426 if (mddev->pers->reshape == NULL)
2427 return -EINVAL;
2428 if (info->raid_disks <= 0 ||
2429 info->raid_disks >= mddev->max_disks)
2430 return -EINVAL;
2431 if (mddev->sync_thread)
2432 return -EBUSY;
2433 rv = mddev->pers->reshape(mddev, info->raid_disks);
2434 if (!rv) {
2435 struct block_device *bdev;
2436
2437 bdev = bdget_disk(mddev->gendisk, 0);
2438 if (bdev) {
2439 down(&bdev->bd_inode->i_sem);
2440 i_size_write(bdev->bd_inode, mddev->array_size << 10);
2441 up(&bdev->bd_inode->i_sem);
2442 bdput(bdev);
2443 }
2444 }
2445 }
2446 md_update_sb(mddev);
2447 return rv;
2448}
2449
2450static int set_disk_faulty(mddev_t *mddev, dev_t dev)
2451{
2452 mdk_rdev_t *rdev;
2453
2454 if (mddev->pers == NULL)
2455 return -ENODEV;
2456
2457 rdev = find_rdev(mddev, dev);
2458 if (!rdev)
2459 return -ENODEV;
2460
2461 md_error(mddev, rdev);
2462 return 0;
2463}
2464
2465static int md_ioctl(struct inode *inode, struct file *file,
2466 unsigned int cmd, unsigned long arg)
2467{
2468 int err = 0;
2469 void __user *argp = (void __user *)arg;
2470 struct hd_geometry __user *loc = argp;
2471 mddev_t *mddev = NULL;
2472
2473 if (!capable(CAP_SYS_ADMIN))
2474 return -EACCES;
2475
2476 /*
2477 * Commands dealing with the RAID driver but not any
2478 * particular array:
2479 */
2480 switch (cmd)
2481 {
2482 case RAID_VERSION:
2483 err = get_version(argp);
2484 goto done;
2485
2486 case PRINT_RAID_DEBUG:
2487 err = 0;
2488 md_print_devices();
2489 goto done;
2490
2491#ifndef MODULE
2492 case RAID_AUTORUN:
2493 err = 0;
2494 autostart_arrays(arg);
2495 goto done;
2496#endif
2497 default:;
2498 }
2499
2500 /*
2501 * Commands creating/starting a new array:
2502 */
2503
2504 mddev = inode->i_bdev->bd_disk->private_data;
2505
2506 if (!mddev) {
2507 BUG();
2508 goto abort;
2509 }
2510
2511
2512 if (cmd == START_ARRAY) {
2513 /* START_ARRAY doesn't need to lock the array as autostart_array
2514 * does the locking, and it could even be a different array
2515 */
2516 static int cnt = 3;
2517 if (cnt > 0 ) {
2518 printk(KERN_WARNING
2519 "md: %s(pid %d) used deprecated START_ARRAY ioctl. "
2520 "This will not be supported beyond 2.6\n",
2521 current->comm, current->pid);
2522 cnt--;
2523 }
2524 err = autostart_array(new_decode_dev(arg));
2525 if (err) {
2526 printk(KERN_WARNING "md: autostart failed!\n");
2527 goto abort;
2528 }
2529 goto done;
2530 }
2531
2532 err = mddev_lock(mddev);
2533 if (err) {
2534 printk(KERN_INFO
2535 "md: ioctl lock interrupted, reason %d, cmd %d\n",
2536 err, cmd);
2537 goto abort;
2538 }
2539
2540 switch (cmd)
2541 {
2542 case SET_ARRAY_INFO:
2543 {
2544 mdu_array_info_t info;
2545 if (!arg)
2546 memset(&info, 0, sizeof(info));
2547 else if (copy_from_user(&info, argp, sizeof(info))) {
2548 err = -EFAULT;
2549 goto abort_unlock;
2550 }
2551 if (mddev->pers) {
2552 err = update_array_info(mddev, &info);
2553 if (err) {
2554 printk(KERN_WARNING "md: couldn't update"
2555 " array info. %d\n", err);
2556 goto abort_unlock;
2557 }
2558 goto done_unlock;
2559 }
2560 if (!list_empty(&mddev->disks)) {
2561 printk(KERN_WARNING
2562 "md: array %s already has disks!\n",
2563 mdname(mddev));
2564 err = -EBUSY;
2565 goto abort_unlock;
2566 }
2567 if (mddev->raid_disks) {
2568 printk(KERN_WARNING
2569 "md: array %s already initialised!\n",
2570 mdname(mddev));
2571 err = -EBUSY;
2572 goto abort_unlock;
2573 }
2574 err = set_array_info(mddev, &info);
2575 if (err) {
2576 printk(KERN_WARNING "md: couldn't set"
2577 " array info. %d\n", err);
2578 goto abort_unlock;
2579 }
2580 }
2581 goto done_unlock;
2582
2583 default:;
2584 }
2585
2586 /*
2587 * Commands querying/configuring an existing array:
2588 */
2589 /* if we are not initialised yet, only ADD_NEW_DISK, STOP_ARRAY or RUN_ARRAY is allowed */
2590 if (!mddev->raid_disks && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY && cmd != RUN_ARRAY) {
2591 err = -ENODEV;
2592 goto abort_unlock;
2593 }
2594
2595 /*
2596 * Commands even a read-only array can execute:
2597 */
2598 switch (cmd)
2599 {
2600 case GET_ARRAY_INFO:
2601 err = get_array_info(mddev, argp);
2602 goto done_unlock;
2603
2604 case GET_DISK_INFO:
2605 err = get_disk_info(mddev, argp);
2606 goto done_unlock;
2607
2608 case RESTART_ARRAY_RW:
2609 err = restart_array(mddev);
2610 goto done_unlock;
2611
2612 case STOP_ARRAY:
2613 err = do_md_stop (mddev, 0);
2614 goto done_unlock;
2615
2616 case STOP_ARRAY_RO:
2617 err = do_md_stop (mddev, 1);
2618 goto done_unlock;
2619
2620 /*
2621 * We have a problem here: there is no easy way to give a CHS
2622 * virtual geometry. We currently pretend that we have 2 heads and
2623 * 4 sectors (with a BIG number of cylinders...). This drives
2624 * dosfs just mad... ;-)
2625 */
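	/*
	 * Worked example (illustrative): with the fake geometry of
	 * 2 heads * 4 sectors = 8 sectors per cylinder, a 1TB array
	 * (~2 * 10^9 sectors) reports get_capacity()/8, i.e. roughly
	 * 2.4 * 10^8 cylinders - hence the BIG number above.
	 */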
2626 case HDIO_GETGEO:
2627 if (!loc) {
2628 err = -EINVAL;
2629 goto abort_unlock;
2630 }
2631 err = put_user (2, (char __user *) &loc->heads);
2632 if (err)
2633 goto abort_unlock;
2634 err = put_user (4, (char __user *) &loc->sectors);
2635 if (err)
2636 goto abort_unlock;
2637 err = put_user(get_capacity(mddev->gendisk)/8,
2638 (short __user *) &loc->cylinders);
2639 if (err)
2640 goto abort_unlock;
2641 err = put_user (get_start_sect(inode->i_bdev),
2642 (long __user *) &loc->start);
2643 goto done_unlock;
2644 }
2645
2646 /*
2647 * The remaining ioctls are changing the state of the
2648 * superblock, so we do not allow read-only arrays
2649 * here:
2650 */
2651 if (mddev->ro) {
2652 err = -EROFS;
2653 goto abort_unlock;
2654 }
2655
2656 switch (cmd)
2657 {
2658 case ADD_NEW_DISK:
2659 {
2660 mdu_disk_info_t info;
2661 if (copy_from_user(&info, argp, sizeof(info)))
2662 err = -EFAULT;
2663 else
2664 err = add_new_disk(mddev, &info);
2665 goto done_unlock;
2666 }
2667
2668 case HOT_REMOVE_DISK:
2669 err = hot_remove_disk(mddev, new_decode_dev(arg));
2670 goto done_unlock;
2671
2672 case HOT_ADD_DISK:
2673 err = hot_add_disk(mddev, new_decode_dev(arg));
2674 goto done_unlock;
2675
2676 case SET_DISK_FAULTY:
2677 err = set_disk_faulty(mddev, new_decode_dev(arg));
2678 goto done_unlock;
2679
2680 case RUN_ARRAY:
2681 err = do_md_run (mddev);
2682 goto done_unlock;
2683
2684 default:
2685 if (_IOC_TYPE(cmd) == MD_MAJOR)
2686 printk(KERN_WARNING "md: %s(pid %d) used"
2687 " obsolete MD ioctl, upgrade your"
2688 " software to use new ictls.\n",
2689 current->comm, current->pid);
2690 err = -EINVAL;
2691 goto abort_unlock;
2692 }
2693
2694done_unlock:
2695abort_unlock:
2696 mddev_unlock(mddev);
2697
2698 return err;
2699done:
2700 if (err)
2701 MD_BUG();
2702abort:
2703 return err;
2704}
2705
2706static int md_open(struct inode *inode, struct file *file)
2707{
2708 /*
2709 * Succeed if we can lock the mddev, which confirms that
2710 * it isn't being stopped right now.
2711 */
2712 mddev_t *mddev = inode->i_bdev->bd_disk->private_data;
2713 int err;
2714
2715 if ((err = mddev_lock(mddev)))
2716 goto out;
2717
2718 err = 0;
2719 mddev_get(mddev);
2720 mddev_unlock(mddev);
2721
2722 check_disk_change(inode->i_bdev);
2723 out:
2724 return err;
2725}
2726
2727static int md_release(struct inode *inode, struct file * file)
2728{
2729 mddev_t *mddev = inode->i_bdev->bd_disk->private_data;
2730
2731 if (!mddev)
2732 BUG();
2733 mddev_put(mddev);
2734
2735 return 0;
2736}
2737
2738static int md_media_changed(struct gendisk *disk)
2739{
2740 mddev_t *mddev = disk->private_data;
2741
2742 return mddev->changed;
2743}
2744
2745static int md_revalidate(struct gendisk *disk)
2746{
2747 mddev_t *mddev = disk->private_data;
2748
2749 mddev->changed = 0;
2750 return 0;
2751}
2752static struct block_device_operations md_fops =
2753{
2754 .owner = THIS_MODULE,
2755 .open = md_open,
2756 .release = md_release,
2757 .ioctl = md_ioctl,
2758 .media_changed = md_media_changed,
2759 .revalidate_disk= md_revalidate,
2760};
2761
75c96f85 2762static int md_thread(void * arg)
1da177e4
LT
2763{
2764 mdk_thread_t *thread = arg;
2765
2766 lock_kernel();
2767
2768 /*
2769 * Detach thread
2770 */
2771
2772 daemonize(thread->name, mdname(thread->mddev));
2773
2774 current->exit_signal = SIGCHLD;
2775 allow_signal(SIGKILL);
2776 thread->tsk = current;
2777
2778 /*
2779 * md_thread is a 'system-thread', its priority should be very
2780 * high. We avoid resource deadlocks individually in each
2781 * raid personality. (RAID5 does preallocation) We also use RR and
2782 * the very same RT priority as kswapd, thus we will never get
2783 * into a priority inversion deadlock.
2784 *
2785 * we definitely have to have equal or higher priority than
2786 * bdflush, otherwise bdflush will deadlock if there are too
2787 * many dirty RAID5 blocks.
2788 */
2789 unlock_kernel();
2790
2791 complete(thread->event);
2792 while (thread->run) {
2793 void (*run)(mddev_t *);
2794
2795 wait_event_interruptible(thread->wqueue,
2796 test_bit(THREAD_WAKEUP, &thread->flags));
2797 if (current->flags & PF_FREEZE)
2798 refrigerator(PF_FREEZE);
2799
2800 clear_bit(THREAD_WAKEUP, &thread->flags);
2801
2802 run = thread->run;
2803 if (run)
2804 run(thread->mddev);
2805
2806 if (signal_pending(current))
2807 flush_signals(current);
2808 }
2809 complete(thread->event);
2810 return 0;
2811}
2812
2813void md_wakeup_thread(mdk_thread_t *thread)
2814{
2815 if (thread) {
2816 dprintk("md: waking up MD thread %s.\n", thread->tsk->comm);
2817 set_bit(THREAD_WAKEUP, &thread->flags);
2818 wake_up(&thread->wqueue);
2819 }
2820}
2821
2822mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev,
2823 const char *name)
2824{
2825 mdk_thread_t *thread;
2826 int ret;
2827 struct completion event;
2828
2829 thread = (mdk_thread_t *) kmalloc
2830 (sizeof(mdk_thread_t), GFP_KERNEL);
2831 if (!thread)
2832 return NULL;
2833
2834 memset(thread, 0, sizeof(mdk_thread_t));
2835 init_waitqueue_head(&thread->wqueue);
2836
2837 init_completion(&event);
2838 thread->event = &event;
2839 thread->run = run;
2840 thread->mddev = mddev;
2841 thread->name = name;
2842 ret = kernel_thread(md_thread, thread, 0);
2843 if (ret < 0) {
2844 kfree(thread);
2845 return NULL;
2846 }
2847 wait_for_completion(&event);
2848 return thread;
2849}
2850
1da177e4
LT
2851void md_unregister_thread(mdk_thread_t *thread)
2852{
2853 struct completion event;
2854
2855 init_completion(&event);
2856
2857 thread->event = &event;
d28446fe
N
2858
2859 /* As soon as ->run is set to NULL, the task could disappear,
2860 * so we need to hold tasklist_lock until we have sent the signal
2861 */
2862 dprintk("interrupting MD-thread pid %d\n", thread->tsk->pid);
2863 read_lock(&tasklist_lock);
1da177e4 2864 thread->run = NULL;
d28446fe
N
2865 send_sig(SIGKILL, thread->tsk, 1);
2866 read_unlock(&tasklist_lock);
1da177e4
LT
2867 wait_for_completion(&event);
2868 kfree(thread);
2869}
2870
2871void md_error(mddev_t *mddev, mdk_rdev_t *rdev)
2872{
2873 if (!mddev) {
2874 MD_BUG();
2875 return;
2876 }
2877
2878 if (!rdev || rdev->faulty)
2879 return;
2880
2881 dprintk("md_error dev:%s, rdev:(%d:%d), (caller: %p,%p,%p,%p).\n",
2882 mdname(mddev),
2883 MAJOR(rdev->bdev->bd_dev), MINOR(rdev->bdev->bd_dev),
2884 __builtin_return_address(0),__builtin_return_address(1),
2885 __builtin_return_address(2),__builtin_return_address(3));
2886
2887 if (!mddev->pers->error_handler)
2888 return;
2889 mddev->pers->error_handler(mddev,rdev);
2890 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
2891 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
2892 md_wakeup_thread(mddev->thread);
2893}
2894
2895/* seq_file implementation /proc/mdstat */
2896
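/*
 * For orientation only - an example of the sort of output these seq_file
 * handlers produce (typical /proc/mdstat content, not a verbatim capture):
 *
 *	Personalities : [raid1]
 *	md0 : active raid1 sdb1[1] sda1[0]
 *	      1048512 blocks [2/2] [UU]
 *	      [===>.................]  resync = 17.6% (184832/1048512) finish=0.8min speed=17712K/sec
 *	unused devices: <none>
 */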
2897static void status_unused(struct seq_file *seq)
2898{
2899 int i = 0;
2900 mdk_rdev_t *rdev;
2901 struct list_head *tmp;
2902
2903 seq_printf(seq, "unused devices: ");
2904
2905 ITERATE_RDEV_PENDING(rdev,tmp) {
2906 char b[BDEVNAME_SIZE];
2907 i++;
2908 seq_printf(seq, "%s ",
2909 bdevname(rdev->bdev,b));
2910 }
2911 if (!i)
2912 seq_printf(seq, "<none>");
2913
2914 seq_printf(seq, "\n");
2915}
2916
2917
2918static void status_resync(struct seq_file *seq, mddev_t * mddev)
2919{
2920 unsigned long max_blocks, resync, res, dt, db, rt;
2921
2922 resync = (mddev->curr_resync - atomic_read(&mddev->recovery_active))/2;
2923
2924 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
2925 max_blocks = mddev->resync_max_sectors >> 1;
2926 else
2927 max_blocks = mddev->size;
2928
2929 /*
2930 * Should not happen.
2931 */
2932 if (!max_blocks) {
2933 MD_BUG();
2934 return;
2935 }
2936 res = (resync/1024)*1000/(max_blocks/1024 + 1);
2937 {
2938 int i, x = res/50, y = 20-x;
2939 seq_printf(seq, "[");
2940 for (i = 0; i < x; i++)
2941 seq_printf(seq, "=");
2942 seq_printf(seq, ">");
2943 for (i = 0; i < y; i++)
2944 seq_printf(seq, ".");
2945 seq_printf(seq, "] ");
2946 }
2947 seq_printf(seq, " %s =%3lu.%lu%% (%lu/%lu)",
2948 (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ?
2949 "resync" : "recovery"),
2950 res/10, res % 10, resync, max_blocks);
2951
2952 /*
2953 * We do not want to overflow, so the order of operands and
2954 * the * 100 / 100 trick are important. We do a +1 to be
2955 * safe against division by zero. We only estimate anyway.
2956 *
2957 * dt: time from mark until now
2958 * db: blocks written from mark until now
2959 * rt: remaining time
2960 */
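	/*
	 * Worked example with made-up numbers: if 30 seconds have passed
	 * since the mark (dt = 30) and 60000 blocks were written in that
	 * time (db = 60000), with 1000000 blocks still to go,
	 * rt = (30 * (1000000 / (60000/100 + 1))) / 100 = 498 seconds,
	 * close to the naive remaining/rate estimate of 500 seconds.
	 */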
2961 dt = ((jiffies - mddev->resync_mark) / HZ);
2962 if (!dt) dt++;
2963 db = resync - (mddev->resync_mark_cnt/2);
2964 rt = (dt * ((max_blocks-resync) / (db/100+1)))/100;
2965
2966 seq_printf(seq, " finish=%lu.%lumin", rt / 60, (rt % 60)/6);
2967
2968 seq_printf(seq, " speed=%ldK/sec", db/dt);
2969}
2970
2971static void *md_seq_start(struct seq_file *seq, loff_t *pos)
2972{
2973 struct list_head *tmp;
2974 loff_t l = *pos;
2975 mddev_t *mddev;
2976
2977 if (l >= 0x10000)
2978 return NULL;
2979 if (!l--)
2980 /* header */
2981 return (void*)1;
2982
2983 spin_lock(&all_mddevs_lock);
2984 list_for_each(tmp,&all_mddevs)
2985 if (!l--) {
2986 mddev = list_entry(tmp, mddev_t, all_mddevs);
2987 mddev_get(mddev);
2988 spin_unlock(&all_mddevs_lock);
2989 return mddev;
2990 }
2991 spin_unlock(&all_mddevs_lock);
2992 if (!l--)
2993 return (void*)2;/* tail */
2994 return NULL;
2995}
2996
2997static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2998{
2999 struct list_head *tmp;
3000 mddev_t *next_mddev, *mddev = v;
3001
3002 ++*pos;
3003 if (v == (void*)2)
3004 return NULL;
3005
3006 spin_lock(&all_mddevs_lock);
3007 if (v == (void*)1)
3008 tmp = all_mddevs.next;
3009 else
3010 tmp = mddev->all_mddevs.next;
3011 if (tmp != &all_mddevs)
3012 next_mddev = mddev_get(list_entry(tmp,mddev_t,all_mddevs));
3013 else {
3014 next_mddev = (void*)2;
3015 *pos = 0x10000;
3016 }
3017 spin_unlock(&all_mddevs_lock);
3018
3019 if (v != (void*)1)
3020 mddev_put(mddev);
3021 return next_mddev;
3022
3023}
3024
3025static void md_seq_stop(struct seq_file *seq, void *v)
3026{
3027 mddev_t *mddev = v;
3028
3029 if (mddev && v != (void*)1 && v != (void*)2)
3030 mddev_put(mddev);
3031}
3032
3033static int md_seq_show(struct seq_file *seq, void *v)
3034{
3035 mddev_t *mddev = v;
3036 sector_t size;
3037 struct list_head *tmp2;
3038 mdk_rdev_t *rdev;
3039 int i;
3040
3041 if (v == (void*)1) {
3042 seq_printf(seq, "Personalities : ");
3043 spin_lock(&pers_lock);
3044 for (i = 0; i < MAX_PERSONALITY; i++)
3045 if (pers[i])
3046 seq_printf(seq, "[%s] ", pers[i]->name);
3047
3048 spin_unlock(&pers_lock);
3049 seq_printf(seq, "\n");
3050 return 0;
3051 }
3052 if (v == (void*)2) {
3053 status_unused(seq);
3054 return 0;
3055 }
3056
3057 if (mddev_lock(mddev)!=0)
3058 return -EINTR;
3059 if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) {
3060 seq_printf(seq, "%s : %sactive", mdname(mddev),
3061 mddev->pers ? "" : "in");
3062 if (mddev->pers) {
3063 if (mddev->ro)
3064 seq_printf(seq, " (read-only)");
3065 seq_printf(seq, " %s", mddev->pers->name);
3066 }
3067
3068 size = 0;
3069 ITERATE_RDEV(mddev,rdev,tmp2) {
3070 char b[BDEVNAME_SIZE];
3071 seq_printf(seq, " %s[%d]",
3072 bdevname(rdev->bdev,b), rdev->desc_nr);
3073 if (rdev->faulty) {
3074 seq_printf(seq, "(F)");
3075 continue;
3076 }
3077 size += rdev->size;
3078 }
3079
3080 if (!list_empty(&mddev->disks)) {
3081 if (mddev->pers)
3082 seq_printf(seq, "\n %llu blocks",
3083 (unsigned long long)mddev->array_size);
3084 else
3085 seq_printf(seq, "\n %llu blocks",
3086 (unsigned long long)size);
3087 }
3088
3089 if (mddev->pers) {
3090 mddev->pers->status (seq, mddev);
3091 seq_printf(seq, "\n ");
3092 if (mddev->curr_resync > 2)
3093 status_resync (seq, mddev);
3094 else if (mddev->curr_resync == 1 || mddev->curr_resync == 2)
3095 seq_printf(seq, " resync=DELAYED");
3096 }
3097
3098 seq_printf(seq, "\n");
3099 }
3100 mddev_unlock(mddev);
3101
3102 return 0;
3103}
3104
3105static struct seq_operations md_seq_ops = {
3106 .start = md_seq_start,
3107 .next = md_seq_next,
3108 .stop = md_seq_stop,
3109 .show = md_seq_show,
3110};
3111
3112static int md_seq_open(struct inode *inode, struct file *file)
3113{
3114 int error;
3115
3116 error = seq_open(file, &md_seq_ops);
3117 return error;
3118}
3119
3120static struct file_operations md_seq_fops = {
3121 .open = md_seq_open,
3122 .read = seq_read,
3123 .llseek = seq_lseek,
3124 .release = seq_release,
3125};
3126
3127int register_md_personality(int pnum, mdk_personality_t *p)
3128{
3129 if (pnum >= MAX_PERSONALITY) {
3130 printk(KERN_ERR
3131 "md: tried to install personality %s as nr %d, but max is %lu\n",
3132 p->name, pnum, MAX_PERSONALITY-1);
3133 return -EINVAL;
3134 }
3135
3136 spin_lock(&pers_lock);
3137 if (pers[pnum]) {
3138 spin_unlock(&pers_lock);
1da177e4
LT
3139 return -EBUSY;
3140 }
3141
3142 pers[pnum] = p;
3143 printk(KERN_INFO "md: %s personality registered as nr %d\n", p->name, pnum);
3144 spin_unlock(&pers_lock);
3145 return 0;
3146}
3147
3148int unregister_md_personality(int pnum)
3149{
a757e64c 3150 if (pnum >= MAX_PERSONALITY)
1da177e4 3151 return -EINVAL;
1da177e4
LT
3152
3153 printk(KERN_INFO "md: %s personality unregistered\n", pers[pnum]->name);
3154 spin_lock(&pers_lock);
3155 pers[pnum] = NULL;
3156 spin_unlock(&pers_lock);
3157 return 0;
3158}
3159
3160static int is_mddev_idle(mddev_t *mddev)
3161{
3162 mdk_rdev_t * rdev;
3163 struct list_head *tmp;
3164 int idle;
3165 unsigned long curr_events;
3166
3167 idle = 1;
3168 ITERATE_RDEV(mddev,rdev,tmp) {
3169 struct gendisk *disk = rdev->bdev->bd_contains->bd_disk;
3170 curr_events = disk_stat_read(disk, read_sectors) +
3171 disk_stat_read(disk, write_sectors) -
3172 atomic_read(&disk->sync_io);
3173 /* Allow some slack between value of curr_events and last_events,
3174 * as there are some uninteresting races.
3175 * Note: the following is an unsigned comparison.
3176 */
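		/*
		 * Illustrative reading of the test below: because the
		 * comparison is unsigned, the +32 bias makes it true
		 * whenever curr_events has drifted more than 32 sectors
		 * from last_events in either direction, e.g.
		 *	curr - last =  40  ->  72 > 64, not idle
		 *	curr - last = -40  ->  wraps to a huge value > 64, not idle
		 *	curr - last =  20  ->  52 <= 64, treated as idle.
		 */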
3177 if ((curr_events - rdev->last_events + 32) > 64) {
3178 rdev->last_events = curr_events;
3179 idle = 0;
3180 }
3181 }
3182 return idle;
3183}
3184
3185void md_done_sync(mddev_t *mddev, int blocks, int ok)
3186{
3187 /* another "blocks" (512byte) blocks have been synced */
3188 atomic_sub(blocks, &mddev->recovery_active);
3189 wake_up(&mddev->recovery_wait);
3190 if (!ok) {
3191 set_bit(MD_RECOVERY_ERR, &mddev->recovery);
3192 md_wakeup_thread(mddev->thread);
3193 // stop recovery, signal do_sync ....
3194 }
3195}
3196
3197
06d91a5f
N
3198/* md_write_start(mddev, bi)
3199 * If we need to update some array metadata (e.g. 'active' flag
3200 * in superblock) before writing, queue bi for later writing
3201 * and return 0, else return 1 and it will be written now
3202 */
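/*
 * Hedged usage sketch (an assumption about the caller, not code from this
 * file): a personality's make_request path is expected to bracket each
 * write with md_write_start()/md_write_end(), deferring the bio when 0 is
 * returned:
 *
 *	if (md_write_start(mddev, bio) == 0)
 *		return 0;	queued; the md thread resubmits it once the
 *				superblock has been marked active
 *	...submit the write...
 *	md_write_end(mddev);	when the write completes
 */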
3203int md_write_start(mddev_t *mddev, struct bio *bi)
1da177e4 3204{
06d91a5f
N
3205 if (bio_data_dir(bi) != WRITE)
3206 return 1;
3207
3208 atomic_inc(&mddev->writes_pending);
3209 spin_lock(&mddev->write_lock);
3210 if (mddev->in_sync == 0 && mddev->sb_dirty == 0) {
3211 spin_unlock(&mddev->write_lock);
3212 return 1;
3213 }
3214 bio_list_add(&mddev->write_list, bi);
3215
3216 if (mddev->in_sync) {
3217 mddev->in_sync = 0;
3218 mddev->sb_dirty = 1;
3219 }
3220 spin_unlock(&mddev->write_lock);
3221 md_wakeup_thread(mddev->thread);
3222 return 0;
1da177e4
LT
3223}
3224
3225void md_write_end(mddev_t *mddev)
3226{
3227 if (atomic_dec_and_test(&mddev->writes_pending)) {
3228 if (mddev->safemode == 2)
3229 md_wakeup_thread(mddev->thread);
3230 else
3231 mod_timer(&mddev->safemode_timer, jiffies + mddev->safemode_delay);
3232 }
3233}
3234
75c96f85 3235static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
1da177e4
LT
3236
3237#define SYNC_MARKS 10
3238#define SYNC_MARK_STEP (3*HZ)
3239static void md_do_sync(mddev_t *mddev)
3240{
3241 mddev_t *mddev2;
3242 unsigned int currspeed = 0,
3243 window;
57afd89f 3244 sector_t max_sectors,j, io_sectors;
1da177e4
LT
3245 unsigned long mark[SYNC_MARKS];
3246 sector_t mark_cnt[SYNC_MARKS];
3247 int last_mark,m;
3248 struct list_head *tmp;
3249 sector_t last_check;
57afd89f 3250 int skipped = 0;
1da177e4
LT
3251
3252 /* just in case the thread restarts... */
3253 if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
3254 return;
3255
3256 /* we overload curr_resync somewhat here.
3257 * 0 == not engaged in resync at all
3258 * 2 == checking that there is no conflict with another sync
3259 * 1 == like 2, but have yielded to allow conflicting resync to
3260 * commence
3261 * other == active in resync - this many blocks
3262 *
3263 * Before starting a resync we must have set curr_resync to
3264 * 2, and then checked that every "conflicting" array has curr_resync
3265 * less than ours. When we find one that is the same or higher
3266 * we wait on resync_wait. To avoid deadlock, we reduce curr_resync
3267 * to 1 if we choose to yield (based arbitrarily on address of mddev structure).
3268 * This will mean we have to start checking from the beginning again.
3269 *
3270 */
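	/*
	 * Illustrative scenario (an assumption for clarity, not new
	 * behaviour): if md0 and md1 share a physical drive and both want
	 * to resync, both first set curr_resync to 2; the one with the
	 * lower mddev address yields by dropping to 1 and sleeping on
	 * resync_wait, while the other passes the scan and starts counting
	 * sectors.  The waiter restarts from "try_again" once it is woken.
	 */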
3271
3272 do {
3273 mddev->curr_resync = 2;
3274
3275 try_again:
3276 if (signal_pending(current)) {
3277 flush_signals(current);
3278 goto skip;
3279 }
3280 ITERATE_MDDEV(mddev2,tmp) {
3281 printk(".");
3282 if (mddev2 == mddev)
3283 continue;
3284 if (mddev2->curr_resync &&
3285 match_mddev_units(mddev,mddev2)) {
3286 DEFINE_WAIT(wq);
3287 if (mddev < mddev2 && mddev->curr_resync == 2) {
3288 /* arbitrarily yield */
3289 mddev->curr_resync = 1;
3290 wake_up(&resync_wait);
3291 }
3292 if (mddev > mddev2 && mddev->curr_resync == 1)
3293 /* no need to wait here, we can wait the next
3294 * time 'round when curr_resync == 2
3295 */
3296 continue;
3297 prepare_to_wait(&resync_wait, &wq, TASK_INTERRUPTIBLE);
3298 if (!signal_pending(current)
3299 && mddev2->curr_resync >= mddev->curr_resync) {
3300 printk(KERN_INFO "md: delaying resync of %s"
3301 " until %s has finished resync (they"
3302 " share one or more physical units)\n",
3303 mdname(mddev), mdname(mddev2));
3304 mddev_put(mddev2);
3305 schedule();
3306 finish_wait(&resync_wait, &wq);
3307 goto try_again;
3308 }
3309 finish_wait(&resync_wait, &wq);
3310 }
3311 }
3312 } while (mddev->curr_resync < 2);
3313
3314 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
3315 /* resync follows the size requested by the personality,
57afd89f 3316 * which defaults to physical size, but can be virtual size
1da177e4
LT
3317 */
3318 max_sectors = mddev->resync_max_sectors;
3319 else
3320 /* recovery follows the physical size of devices */
3321 max_sectors = mddev->size << 1;
3322
3323 printk(KERN_INFO "md: syncing RAID array %s\n", mdname(mddev));
3324 printk(KERN_INFO "md: minimum _guaranteed_ reconstruction speed:"
3325 " %d KB/sec/disc.\n", sysctl_speed_limit_min);
3326 printk(KERN_INFO "md: using maximum available idle IO bandwith "
3327 "(but not more than %d KB/sec) for reconstruction.\n",
3328 sysctl_speed_limit_max);
3329
3330 is_mddev_idle(mddev); /* this also initializes IO event counters */
3331 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
3332 j = mddev->recovery_cp;
3333 else
3334 j = 0;
57afd89f 3335 io_sectors = 0;
1da177e4
LT
3336 for (m = 0; m < SYNC_MARKS; m++) {
3337 mark[m] = jiffies;
57afd89f 3338 mark_cnt[m] = io_sectors;
1da177e4
LT
3339 }
3340 last_mark = 0;
3341 mddev->resync_mark = mark[last_mark];
3342 mddev->resync_mark_cnt = mark_cnt[last_mark];
3343
3344 /*
3345 * Tune reconstruction:
3346 */
3347 window = 32*(PAGE_SIZE/512);
3348 printk(KERN_INFO "md: using %dk window, over a total of %llu blocks.\n",
3349 window/2,(unsigned long long) max_sectors/2);
3350
3351 atomic_set(&mddev->recovery_active, 0);
3352 init_waitqueue_head(&mddev->recovery_wait);
3353 last_check = 0;
3354
3355 if (j>2) {
3356 printk(KERN_INFO
3357 "md: resuming recovery of %s from checkpoint.\n",
3358 mdname(mddev));
3359 mddev->curr_resync = j;
3360 }
3361
3362 while (j < max_sectors) {
57afd89f 3363 sector_t sectors;
1da177e4 3364
57afd89f
N
3365 skipped = 0;
3366 sectors = mddev->pers->sync_request(mddev, j, &skipped,
3367 currspeed < sysctl_speed_limit_min);
3368 if (sectors == 0) {
1da177e4
LT
3369 set_bit(MD_RECOVERY_ERR, &mddev->recovery);
3370 goto out;
3371 }
57afd89f
N
3372
3373 if (!skipped) { /* actual IO requested */
3374 io_sectors += sectors;
3375 atomic_add(sectors, &mddev->recovery_active);
3376 }
3377
1da177e4
LT
3378 j += sectors;
3379 if (j>1) mddev->curr_resync = j;
3380
57afd89f
N
3381
3382 if (last_check + window > io_sectors || j == max_sectors)
1da177e4
LT
3383 continue;
3384
57afd89f 3385 last_check = io_sectors;
1da177e4
LT
3386
3387 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery) ||
3388 test_bit(MD_RECOVERY_ERR, &mddev->recovery))
3389 break;
3390
3391 repeat:
3392 if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP )) {
3393 /* step marks */
3394 int next = (last_mark+1) % SYNC_MARKS;
3395
3396 mddev->resync_mark = mark[next];
3397 mddev->resync_mark_cnt = mark_cnt[next];
3398 mark[next] = jiffies;
57afd89f 3399 mark_cnt[next] = io_sectors - atomic_read(&mddev->recovery_active);
1da177e4
LT
3400 last_mark = next;
3401 }
3402
3403
3404 if (signal_pending(current)) {
3405 /*
3406 * got a signal, exit.
3407 */
3408 printk(KERN_INFO
3409 "md: md_do_sync() got signal ... exiting\n");
3410 flush_signals(current);
3411 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
3412 goto out;
3413 }
3414
3415 /*
3416 * this loop exits only when we are either slower than
3417 * the 'hard' speed limit, or the system was IO-idle for
3418 * a jiffy.
3419 * the system might be non-idle CPU-wise, but we only care
3420 * about not overloading the IO subsystem. (things like an
3421 * e2fsck being done on the RAID array should execute fast)
3422 */
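		/*
		 * Example with made-up numbers: if io_sectors has advanced
		 * by 40000 sectors over the last 10 seconds of this resync
		 * mark, currspeed below is roughly (40000/2)/(10+1), about
		 * 1800 KB/sec; being above speed_limit_min it is throttled
		 * whenever the disks are not idle or speed_limit_max is
		 * exceeded.
		 */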
3423 mddev->queue->unplug_fn(mddev->queue);
3424 cond_resched();
3425
57afd89f
N
3426 currspeed = ((unsigned long)(io_sectors-mddev->resync_mark_cnt))/2
3427 /((jiffies-mddev->resync_mark)/HZ +1) +1;
1da177e4
LT
3428
3429 if (currspeed > sysctl_speed_limit_min) {
3430 if ((currspeed > sysctl_speed_limit_max) ||
3431 !is_mddev_idle(mddev)) {
3432 msleep_interruptible(250);
3433 goto repeat;
3434 }
3435 }
3436 }
3437 printk(KERN_INFO "md: %s: sync done.\n",mdname(mddev));
3438 /*
3439 * this also signals 'finished resyncing' to md_stop
3440 */
3441 out:
3442 mddev->queue->unplug_fn(mddev->queue);
3443
3444 wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active));
3445
3446 /* tell personality that we are finished */
57afd89f 3447 mddev->pers->sync_request(mddev, max_sectors, &skipped, 1);
1da177e4
LT
3448
3449 if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) &&
3450 mddev->curr_resync > 2 &&
3451 mddev->curr_resync >= mddev->recovery_cp) {
3452 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
3453 printk(KERN_INFO
3454 "md: checkpointing recovery of %s.\n",
3455 mdname(mddev));
3456 mddev->recovery_cp = mddev->curr_resync;
3457 } else
3458 mddev->recovery_cp = MaxSector;
3459 }
3460
1da177e4
LT
3461 skip:
3462 mddev->curr_resync = 0;
3463 wake_up(&resync_wait);
3464 set_bit(MD_RECOVERY_DONE, &mddev->recovery);
3465 md_wakeup_thread(mddev->thread);
3466}
3467
3468
3469/*
3470 * This routine is regularly called by all per-raid-array threads to
3471 * deal with generic issues like resync and super-block update.
3472 * Raid personalities that don't have a thread (linear/raid0) do not
3473 * need this as they never do any recovery or update the superblock.
3474 *
3475 * It does not do any resync itself, but rather "forks" off other threads
3476 * to do that as needed.
3477 * When it is determined that resync is needed, we set MD_RECOVERY_RUNNING in
3478 * "->recovery" and create a thread at ->sync_thread.
3479 * When the thread finishes it sets MD_RECOVERY_DONE (and might set MD_RECOVERY_ERR)
3480 * and wakes up this thread, which will reap the sync thread and finish up.
3481 * This thread also removes any faulty devices (with nr_pending == 0).
3482 *
3483 * The overall approach is:
3484 * 1/ if the superblock needs updating, update it.
3485 * 2/ If a recovery thread is running, don't do anything else.
3486 * 3/ If recovery has finished, clean up, possibly marking spares active.
3487 * 4/ If there are any faulty devices, remove them.
3488 * 5/ If array is degraded, try to add spare devices
3489 * 6/ If array has spares or is not in-sync, start a resync thread.
3490 */
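/*
 * Hedged walkthrough (an illustration of the flags above, not a new
 * contract): a disk failure makes md_error() set MD_RECOVERY_NEEDED, so
 * this routine spawns md_do_sync() with MD_RECOVERY_RUNNING set;
 * md_do_sync() sets MD_RECOVERY_DONE (plus MD_RECOVERY_ERR or
 * MD_RECOVERY_INTR on failure/interruption) and wakes the thread again,
 * at which point the sync thread is reaped, spares are activated on
 * success, and the superblock is updated.
 */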
3491void md_check_recovery(mddev_t *mddev)
3492{
3493 mdk_rdev_t *rdev;
3494 struct list_head *rtmp;
3495
3496
3497 dprintk(KERN_INFO "md: recovery thread got woken up ...\n");
3498
3499 if (mddev->ro)
3500 return;
fca4d848
N
3501
3502 if (signal_pending(current)) {
3503 if (mddev->pers->sync_request) {
3504 printk(KERN_INFO "md: %s in immediate safe mode\n",
3505 mdname(mddev));
3506 mddev->safemode = 2;
3507 }
3508 flush_signals(current);
3509 }
3510
1da177e4
LT
3511 if ( ! (
3512 mddev->sb_dirty ||
3513 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
fca4d848 3514 test_bit(MD_RECOVERY_DONE, &mddev->recovery) ||
06d91a5f 3515 mddev->write_list.head ||
fca4d848
N
3516 (mddev->safemode == 1) ||
3517 (mddev->safemode == 2 && ! atomic_read(&mddev->writes_pending)
3518 && !mddev->in_sync && mddev->recovery_cp == MaxSector)
1da177e4
LT
3519 ))
3520 return;
fca4d848 3521
1da177e4
LT
3522 if (mddev_trylock(mddev)==0) {
3523 int spares =0;
06d91a5f 3524 struct bio *blist;
fca4d848 3525
06d91a5f 3526 spin_lock(&mddev->write_lock);
fca4d848
N
3527 if (mddev->safemode && !atomic_read(&mddev->writes_pending) &&
3528 !mddev->in_sync && mddev->recovery_cp == MaxSector) {
3529 mddev->in_sync = 1;
3530 mddev->sb_dirty = 1;
3531 }
3532 if (mddev->safemode == 1)
3533 mddev->safemode = 0;
06d91a5f
N
3534 blist = bio_list_get(&mddev->write_list);
3535 spin_unlock(&mddev->write_lock);
fca4d848 3536
1da177e4
LT
3537 if (mddev->sb_dirty)
3538 md_update_sb(mddev);
06d91a5f
N
3539
3540 while (blist) {
3541 struct bio *b = blist;
3542 blist = blist->bi_next;
3543 b->bi_next = NULL;
3544 generic_make_request(b);
3545 /* we already counted this, so need to un-count */
3546 md_write_end(mddev);
3547 }
3548
3549
1da177e4
LT
3550 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
3551 !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) {
3552 /* resync/recovery still happening */
3553 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
3554 goto unlock;
3555 }
3556 if (mddev->sync_thread) {
3557 /* resync has finished, collect result */
3558 md_unregister_thread(mddev->sync_thread);
3559 mddev->sync_thread = NULL;
3560 if (!test_bit(MD_RECOVERY_ERR, &mddev->recovery) &&
3561 !test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
3562 /* success...*/
3563 /* activate any spares */
3564 mddev->pers->spare_active(mddev);
3565 }
3566 md_update_sb(mddev);
3567 mddev->recovery = 0;
3568 /* flag recovery needed just to double check */
3569 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
3570 goto unlock;
3571 }
3572 if (mddev->recovery)
3573 /* probably just the RECOVERY_NEEDED flag */
3574 mddev->recovery = 0;
3575
3576 /* no recovery is running.
3577 * remove any failed drives, then
3578 * add spares if possible.
3579 * Spares are also removed and re-added, to allow
3580 * the personality to fail the re-add.
3581 */
3582 ITERATE_RDEV(mddev,rdev,rtmp)
3583 if (rdev->raid_disk >= 0 &&
3584 (rdev->faulty || ! rdev->in_sync) &&
3585 atomic_read(&rdev->nr_pending)==0) {
3586 if (mddev->pers->hot_remove_disk(mddev, rdev->raid_disk)==0)
3587 rdev->raid_disk = -1;
3588 }
3589
3590 if (mddev->degraded) {
3591 ITERATE_RDEV(mddev,rdev,rtmp)
3592 if (rdev->raid_disk < 0
3593 && !rdev->faulty) {
3594 if (mddev->pers->hot_add_disk(mddev,rdev))
3595 spares++;
3596 else
3597 break;
3598 }
3599 }
3600
3601 if (!spares && (mddev->recovery_cp == MaxSector )) {
3602 /* nothing we can do ... */
3603 goto unlock;
3604 }
3605 if (mddev->pers->sync_request) {
3606 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
3607 if (!spares)
3608 set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
3609 mddev->sync_thread = md_register_thread(md_do_sync,
3610 mddev,
3611 "%s_resync");
3612 if (!mddev->sync_thread) {
3613 printk(KERN_ERR "%s: could not start resync"
3614 " thread...\n",
3615 mdname(mddev));
3616 /* leave the spares where they are, it shouldn't hurt */
3617 mddev->recovery = 0;
3618 } else {
3619 md_wakeup_thread(mddev->sync_thread);
3620 }
3621 }
3622 unlock:
3623 mddev_unlock(mddev);
3624 }
3625}
3626
75c96f85
AB
3627static int md_notify_reboot(struct notifier_block *this,
3628 unsigned long code, void *x)
1da177e4
LT
3629{
3630 struct list_head *tmp;
3631 mddev_t *mddev;
3632
3633 if ((code == SYS_DOWN) || (code == SYS_HALT) || (code == SYS_POWER_OFF)) {
3634
3635 printk(KERN_INFO "md: stopping all md devices.\n");
3636
3637 ITERATE_MDDEV(mddev,tmp)
3638 if (mddev_trylock(mddev)==0)
3639 do_md_stop (mddev, 1);
3640 /*
3641 * certain more exotic SCSI devices are known to be
3642 * volatile wrt too early system reboots. While the
3643 * right place to handle this issue is the given
3644 * driver, we do want to have a safe RAID driver ...
3645 */
3646 mdelay(1000*1);
3647 }
3648 return NOTIFY_DONE;
3649}
3650
75c96f85 3651static struct notifier_block md_notifier = {
1da177e4
LT
3652 .notifier_call = md_notify_reboot,
3653 .next = NULL,
3654 .priority = INT_MAX, /* before any real devices */
3655};
3656
3657static void md_geninit(void)
3658{
3659 struct proc_dir_entry *p;
3660
3661 dprintk("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t));
3662
3663 p = create_proc_entry("mdstat", S_IRUGO, NULL);
3664 if (p)
3665 p->proc_fops = &md_seq_fops;
3666}
3667
75c96f85 3668static int __init md_init(void)
1da177e4
LT
3669{
3670 int minor;
3671
3672 printk(KERN_INFO "md: md driver %d.%d.%d MAX_MD_DEVS=%d,"
3673 " MD_SB_DISKS=%d\n",
3674 MD_MAJOR_VERSION, MD_MINOR_VERSION,
3675 MD_PATCHLEVEL_VERSION, MAX_MD_DEVS, MD_SB_DISKS);
3676
3677 if (register_blkdev(MAJOR_NR, "md"))
3678 return -1;
3679 if ((mdp_major=register_blkdev(0, "mdp"))<=0) {
3680 unregister_blkdev(MAJOR_NR, "md");
3681 return -1;
3682 }
3683 devfs_mk_dir("md");
3684 blk_register_region(MKDEV(MAJOR_NR, 0), MAX_MD_DEVS, THIS_MODULE,
3685 md_probe, NULL, NULL);
3686 blk_register_region(MKDEV(mdp_major, 0), MAX_MD_DEVS<<MdpMinorShift, THIS_MODULE,
3687 md_probe, NULL, NULL);
3688
3689 for (minor=0; minor < MAX_MD_DEVS; ++minor)
3690 devfs_mk_bdev(MKDEV(MAJOR_NR, minor),
3691 S_IFBLK|S_IRUSR|S_IWUSR,
3692 "md/%d", minor);
3693
3694 for (minor=0; minor < MAX_MD_DEVS; ++minor)
3695 devfs_mk_bdev(MKDEV(mdp_major, minor<<MdpMinorShift),
3696 S_IFBLK|S_IRUSR|S_IWUSR,
3697 "md/mdp%d", minor);
3698
3699
3700 register_reboot_notifier(&md_notifier);
3701 raid_table_header = register_sysctl_table(raid_root_table, 1);
3702
3703 md_geninit();
3704 return (0);
3705}
3706
3707
3708#ifndef MODULE
3709
3710/*
3711 * Searches all registered partitions for autorun RAID arrays
3712 * at boot time.
3713 */
3714static dev_t detected_devices[128];
3715static int dev_cnt;
3716
3717void md_autodetect_dev(dev_t dev)
3718{
3719 if (dev_cnt >= 0 && dev_cnt < 127)
3720 detected_devices[dev_cnt++] = dev;
3721}
3722
3723
3724static void autostart_arrays(int part)
3725{
3726 mdk_rdev_t *rdev;
3727 int i;
3728
3729 printk(KERN_INFO "md: Autodetecting RAID arrays.\n");
3730
3731 for (i = 0; i < dev_cnt; i++) {
3732 dev_t dev = detected_devices[i];
3733
3734 rdev = md_import_device(dev,0, 0);
3735 if (IS_ERR(rdev))
3736 continue;
3737
3738 if (rdev->faulty) {
3739 MD_BUG();
3740 continue;
3741 }
3742 list_add(&rdev->same_set, &pending_raid_disks);
3743 }
3744 dev_cnt = 0;
3745
3746 autorun_devices(part);
3747}
3748
3749#endif
3750
3751static __exit void md_exit(void)
3752{
3753 mddev_t *mddev;
3754 struct list_head *tmp;
3755 int i;
3756 blk_unregister_region(MKDEV(MAJOR_NR,0), MAX_MD_DEVS);
3757 blk_unregister_region(MKDEV(mdp_major,0), MAX_MD_DEVS << MdpMinorShift);
3758 for (i=0; i < MAX_MD_DEVS; i++)
3759 devfs_remove("md/%d", i);
3760 for (i=0; i < MAX_MD_DEVS; i++)
3761 devfs_remove("md/d%d", i);
3762
3763 devfs_remove("md");
3764
3765 unregister_blkdev(MAJOR_NR,"md");
3766 unregister_blkdev(mdp_major, "mdp");
3767 unregister_reboot_notifier(&md_notifier);
3768 unregister_sysctl_table(raid_table_header);
3769 remove_proc_entry("mdstat", NULL);
3770 ITERATE_MDDEV(mddev,tmp) {
3771 struct gendisk *disk = mddev->gendisk;
3772 if (!disk)
3773 continue;
3774 export_array(mddev);
3775 del_gendisk(disk);
3776 put_disk(disk);
3777 mddev->gendisk = NULL;
3778 mddev_put(mddev);
3779 }
3780}
3781
3782module_init(md_init)
3783module_exit(md_exit)
3784
3785EXPORT_SYMBOL(register_md_personality);
3786EXPORT_SYMBOL(unregister_md_personality);
3787EXPORT_SYMBOL(md_error);
3788EXPORT_SYMBOL(md_done_sync);
3789EXPORT_SYMBOL(md_write_start);
3790EXPORT_SYMBOL(md_write_end);
1da177e4
LT
3791EXPORT_SYMBOL(md_register_thread);
3792EXPORT_SYMBOL(md_unregister_thread);
3793EXPORT_SYMBOL(md_wakeup_thread);
3794EXPORT_SYMBOL(md_print_devices);
3795EXPORT_SYMBOL(md_check_recovery);
3796MODULE_LICENSE("GPL");