| /* |
| md.c : Multiple Devices driver for Linux |
| Copyright (C) 1998, 1999, 2000 Ingo Molnar |
| |
| completely rewritten, based on the MD driver code from Marc Zyngier |
| |
| Changes: |
| |
| - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar |
| - RAID-6 extensions by H. Peter Anvin <hpa@zytor.com> |
| - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net> |
| - kerneld support by Boris Tobotras <boris@xtalk.msk.su> |
| - kmod support by: Cyrus Durgin |
| - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com> |
| - Devfs support by Richard Gooch <rgooch@atnf.csiro.au> |
| |
| - lots of fixes and improvements to the RAID1/RAID5 and generic |
| RAID code (such as request based resynchronization): |
| |
| Neil Brown <neilb@cse.unsw.edu.au>. |
| |
| - persistent bitmap code |
| Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc. |
| |
| This program is free software; you can redistribute it and/or modify |
| it under the terms of the GNU General Public License as published by |
| the Free Software Foundation; either version 2, or (at your option) |
| any later version. |
| |
| You should have received a copy of the GNU General Public License |
| (for example /usr/src/linux/COPYING); if not, write to the Free |
| Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. |
| |
| Errors, Warnings, etc. |
| Please use: |
| pr_crit() for error conditions that risk data loss |
| pr_err() for error conditions that are unexpected, like an IO error |
| or internal inconsistency |
   pr_warn() for error conditions that could have been predicted, like
   adding a device to an array when it has incompatible metadata
   pr_info() for interesting, very rare events, like an array starting
   or stopping, or resync starting or stopping
| pr_debug() for everything else. |
| |
| */ |
| |
| #include <linux/sched/signal.h> |
| #include <linux/kthread.h> |
| #include <linux/blkdev.h> |
| #include <linux/badblocks.h> |
| #include <linux/sysctl.h> |
| #include <linux/seq_file.h> |
| #include <linux/fs.h> |
| #include <linux/poll.h> |
| #include <linux/ctype.h> |
| #include <linux/string.h> |
| #include <linux/hdreg.h> |
| #include <linux/proc_fs.h> |
| #include <linux/random.h> |
| #include <linux/module.h> |
| #include <linux/reboot.h> |
| #include <linux/file.h> |
| #include <linux/compat.h> |
| #include <linux/delay.h> |
| #include <linux/raid/md_p.h> |
| #include <linux/raid/md_u.h> |
| #include <linux/slab.h> |
| #include <linux/percpu-refcount.h> |
| |
| #include <trace/events/block.h> |
| #include "md.h" |
| #include "md-bitmap.h" |
| #include "md-cluster.h" |
| |
| #ifndef MODULE |
| static void autostart_arrays(int part); |
| #endif |
| |
| /* pers_list is a list of registered personalities protected |
| * by pers_lock. |
| * pers_lock does extra service to protect accesses to |
| * mddev->thread when the mutex cannot be held. |
| */ |
| static LIST_HEAD(pers_list); |
| static DEFINE_SPINLOCK(pers_lock); |
| |
| static struct kobj_type md_ktype; |
| |
| struct md_cluster_operations *md_cluster_ops; |
| EXPORT_SYMBOL(md_cluster_ops); |
| struct module *md_cluster_mod; |
| EXPORT_SYMBOL(md_cluster_mod); |
| |
| static DECLARE_WAIT_QUEUE_HEAD(resync_wait); |
| static struct workqueue_struct *md_wq; |
| static struct workqueue_struct *md_misc_wq; |
| |
| static int remove_and_add_spares(struct mddev *mddev, |
| struct md_rdev *this); |
| static void mddev_detach(struct mddev *mddev); |
| |
| /* |
| * Default number of read corrections we'll attempt on an rdev |
| * before ejecting it from the array. We divide the read error |
| * count by 2 for every hour elapsed between read errors. |
| */ |
| #define MD_DEFAULT_MAX_CORRECTED_READ_ERRORS 20 |
| /* |
| * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit' |
| * is 1000 KB/sec, so the extra system load does not show up that much. |
| * Increase it if you want to have more _guaranteed_ speed. Note that |
| * the RAID driver will use the maximum available bandwidth if the IO |
| * subsystem is idle. There is also an 'absolute maximum' reconstruction |
| * speed limit - in case reconstruction slows down your system despite |
| * idle IO detection. |
| * |
| * you can change it via /proc/sys/dev/raid/speed_limit_min and _max. |
| * or /sys/block/mdX/md/sync_speed_{min,max} |
| */ |
| |
| static int sysctl_speed_limit_min = 1000; |
| static int sysctl_speed_limit_max = 200000; |
| static inline int speed_min(struct mddev *mddev) |
| { |
| return mddev->sync_speed_min ? |
| mddev->sync_speed_min : sysctl_speed_limit_min; |
| } |
| |
| static inline int speed_max(struct mddev *mddev) |
| { |
| return mddev->sync_speed_max ? |
| mddev->sync_speed_max : sysctl_speed_limit_max; |
| } |
| |
| static struct ctl_table_header *raid_table_header; |
| |
| static struct ctl_table raid_table[] = { |
| { |
| .procname = "speed_limit_min", |
| .data = &sysctl_speed_limit_min, |
| .maxlen = sizeof(int), |
| .mode = S_IRUGO|S_IWUSR, |
| .proc_handler = proc_dointvec, |
| }, |
| { |
| .procname = "speed_limit_max", |
| .data = &sysctl_speed_limit_max, |
| .maxlen = sizeof(int), |
| .mode = S_IRUGO|S_IWUSR, |
| .proc_handler = proc_dointvec, |
| }, |
| { } |
| }; |
| |
| static struct ctl_table raid_dir_table[] = { |
| { |
| .procname = "raid", |
| .maxlen = 0, |
| .mode = S_IRUGO|S_IXUGO, |
| .child = raid_table, |
| }, |
| { } |
| }; |
| |
/* The "dev" root directory for the table above (/proc/sys/dev). */
static struct ctl_table raid_root_table[] = {
	{
		.procname = "dev",
		.maxlen = 0,
		.mode = 0555,
		.child = raid_dir_table,
	},
	{ }
};
| |
| static const struct block_device_operations md_fops; |
| |
| static int start_readonly; |
| |
| /* |
| * The original mechanism for creating an md device is to create |
| * a device node in /dev and to open it. This causes races with device-close. |
| * The preferred method is to write to the "new_array" module parameter. |
| * This can avoid races. |
| * Setting create_on_open to false disables the original mechanism |
| * so all the races disappear. |
| */ |
| static bool create_on_open = true; |
| |
| struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs, |
| struct mddev *mddev) |
| { |
| struct bio *b; |
| |
| if (!mddev || !bioset_initialized(&mddev->bio_set)) |
| return bio_alloc(gfp_mask, nr_iovecs); |
| |
| b = bio_alloc_bioset(gfp_mask, nr_iovecs, &mddev->bio_set); |
| if (!b) |
| return NULL; |
| return b; |
| } |
| EXPORT_SYMBOL_GPL(bio_alloc_mddev); |
| |
| static struct bio *md_bio_alloc_sync(struct mddev *mddev) |
| { |
| if (!mddev || !bioset_initialized(&mddev->sync_set)) |
| return bio_alloc(GFP_NOIO, 1); |
| |
| return bio_alloc_bioset(GFP_NOIO, 1, &mddev->sync_set); |
| } |
| |
| /* |
| * We have a system wide 'event count' that is incremented |
| * on any 'interesting' event, and readers of /proc/mdstat |
| * can use 'poll' or 'select' to find out when the event |
| * count increases. |
| * |
| * Events are: |
| * start array, stop array, error, add device, remove device, |
| * start build, activate spare |
| */ |
| static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters); |
| static atomic_t md_event_count; |
/* Bump the global event count and wake /proc/mdstat poll/select waiters.
 * @mddev is currently unused; kept for the exported interface.
 */
void md_new_event(struct mddev *mddev)
{
	atomic_inc(&md_event_count);
	wake_up(&md_event_waiters);
}
EXPORT_SYMBOL_GPL(md_new_event);
| |
| /* |
| * Enables to iterate over all existing md arrays |
| * all_mddevs_lock protects this list. |
| */ |
| static LIST_HEAD(all_mddevs); |
| static DEFINE_SPINLOCK(all_mddevs_lock); |
| |
| /* |
| * iterates through all used mddevs in the system. |
| * We take care to grab the all_mddevs_lock whenever navigating |
| * the list, and to always hold a refcount when unlocked. |
 * Any code which breaks out of this loop while owning
 * a reference to the current mddev must mddev_put it.
| */ |
/*
 * for_each_mddev(mddev, tmp): all_mddevs_lock is held only while
 * stepping the list; a reference on the current mddev is taken before
 * dropping the lock and released on the next iteration.
 */
#define for_each_mddev(_mddev,_tmp)					\
									\
	for (({ spin_lock(&all_mddevs_lock);				\
		_tmp = all_mddevs.next;					\
		_mddev = NULL;});					\
	     ({ if (_tmp != &all_mddevs)				\
			mddev_get(list_entry(_tmp, struct mddev, all_mddevs));\
		spin_unlock(&all_mddevs_lock);				\
		if (_mddev) mddev_put(_mddev);				\
		_mddev = list_entry(_tmp, struct mddev, all_mddevs);	\
		_tmp != &all_mddevs;});					\
	     ({ spin_lock(&all_mddevs_lock);				\
		_tmp = _tmp->next;})					\
		)
| |
| /* Rather than calling directly into the personality make_request function, |
| * IO requests come here first so that we can check if the device is |
| * being suspended pending a reconfiguration. |
| * We hold a refcount over the call to ->make_request. By the time that |
| * call has finished, the bio has been linked into some internal structure |
| * and so is visible to ->quiesce(), so we don't need the refcount any more. |
| */ |
| static bool is_suspended(struct mddev *mddev, struct bio *bio) |
| { |
| if (mddev->suspended) |
| return true; |
| if (bio_data_dir(bio) != WRITE) |
| return false; |
| if (mddev->suspend_lo >= mddev->suspend_hi) |
| return false; |
| if (bio->bi_iter.bi_sector >= mddev->suspend_hi) |
| return false; |
| if (bio_end_sector(bio) < mddev->suspend_lo) |
| return false; |
| return true; |
| } |
| |
/*
 * Hand @bio to the personality's make_request, first waiting while the
 * array (or the affected sector range) is suspended.  active_io is held
 * across the call so mddev_suspend() can wait for in-flight requests.
 */
void md_handle_request(struct mddev *mddev, struct bio *bio)
{
check_suspended:
	rcu_read_lock();
	if (is_suspended(mddev, bio)) {
		DEFINE_WAIT(__wait);
		/* sleep on sb_wait until the suspend window clears */
		for (;;) {
			prepare_to_wait(&mddev->sb_wait, &__wait,
					TASK_UNINTERRUPTIBLE);
			if (!is_suspended(mddev, bio))
				break;
			rcu_read_unlock();
			schedule();
			rcu_read_lock();
		}
		finish_wait(&mddev->sb_wait, &__wait);
	}
	atomic_inc(&mddev->active_io);
	rcu_read_unlock();

	if (!mddev->pers->make_request(mddev, bio)) {
		/* personality asked us to retry: drop our count and recheck */
		atomic_dec(&mddev->active_io);
		wake_up(&mddev->sb_wait);
		goto check_suspended;
	}

	/* last active_io out wakes a waiting mddev_suspend() */
	if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended)
		wake_up(&mddev->sb_wait);
}
EXPORT_SYMBOL(md_handle_request);
| |
/*
 * Block-layer entry point for all bios submitted to the array device:
 * split oversized bios, reject I/O to unconfigured or read-only arrays,
 * pass the rest to md_handle_request() and account the I/O statistics.
 */
static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio)
{
	const int rw = bio_data_dir(bio);
	const int sgrp = op_stat_group(bio_op(bio));
	struct mddev *mddev = q->queuedata;
	unsigned int sectors;
	int cpu;

	blk_queue_split(q, &bio);

	if (mddev == NULL || mddev->pers == NULL) {
		bio_io_error(bio);
		return BLK_QC_T_NONE;
	}
	if (mddev->ro == 1 && unlikely(rw == WRITE)) {
		/* read-only array: fail data writes, complete empty flushes */
		if (bio_sectors(bio) != 0)
			bio->bi_status = BLK_STS_IOERR;
		bio_endio(bio);
		return BLK_QC_T_NONE;
	}

	/*
	 * save the sectors now since our bio can
	 * go away inside make_request
	 */
	sectors = bio_sectors(bio);
	/* bio could be mergeable after passing to underlayer */
	bio->bi_opf &= ~REQ_NOMERGE;

	md_handle_request(mddev, bio);

	cpu = part_stat_lock();
	part_stat_inc(cpu, &mddev->gendisk->part0, ios[sgrp]);
	part_stat_add(cpu, &mddev->gendisk->part0, sectors[sgrp], sectors);
	part_stat_unlock();

	return BLK_QC_T_NONE;
}
| |
| /* mddev_suspend makes sure no new requests are submitted |
| * to the device, and that any requests that have been submitted |
| * are completely handled. |
| * Once mddev_detach() is called and completes, the module will be |
| * completely unused. |
| */ |
/*
 * Suspend the array: block new requests and wait for all in-flight ones
 * (tracked by active_io) to finish.  Nests via the suspended counter.
 * Must be called under reconfig_mutex and never from the md thread.
 */
void mddev_suspend(struct mddev *mddev)
{
	WARN_ON_ONCE(mddev->thread && current == mddev->thread->tsk);
	lockdep_assert_held(&mddev->reconfig_mutex);
	if (mddev->suspended++)
		return;		/* already suspended; just deepen the nesting */
	/* make the new ->suspended value visible to md_handle_request() */
	synchronize_rcu();
	wake_up(&mddev->sb_wait);
	set_bit(MD_ALLOW_SB_UPDATE, &mddev->flags);
	smp_mb__after_atomic();
	wait_event(mddev->sb_wait, atomic_read(&mddev->active_io) == 0);
	mddev->pers->quiesce(mddev, 1);
	clear_bit_unlock(MD_ALLOW_SB_UPDATE, &mddev->flags);
	wait_event(mddev->sb_wait, !test_bit(MD_UPDATING_SB, &mddev->flags));

	del_timer_sync(&mddev->safemode_timer);
}
EXPORT_SYMBOL_GPL(mddev_suspend);
| |
/*
 * Undo one level of mddev_suspend(); the array only resumes when the
 * nesting count drops to zero.  Caller holds reconfig_mutex.
 */
void mddev_resume(struct mddev *mddev)
{
	lockdep_assert_held(&mddev->reconfig_mutex);
	if (--mddev->suspended)
		return;		/* still suspended by an outer caller */
	wake_up(&mddev->sb_wait);
	mddev->pers->quiesce(mddev, 0);

	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
	md_wakeup_thread(mddev->thread);
	md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
}
EXPORT_SYMBOL_GPL(mddev_resume);
| |
| int mddev_congested(struct mddev *mddev, int bits) |
| { |
| struct md_personality *pers = mddev->pers; |
| int ret = 0; |
| |
| rcu_read_lock(); |
| if (mddev->suspended) |
| ret = 1; |
| else if (pers && pers->congested) |
| ret = pers->congested(mddev, bits); |
| rcu_read_unlock(); |
| return ret; |
| } |
| EXPORT_SYMBOL_GPL(mddev_congested); |
/* congested_fn wrapper for the block layer; @data is the mddev. */
static int md_congested(void *data, int bits)
{
	return mddev_congested((struct mddev *)data, bits);
}
| |
| /* |
| * Generic flush handling for md |
| */ |
| |
/*
 * Completion handler for the per-rdev pre-flush bios issued by
 * submit_flushes(); the last one to finish queues md_submit_flush_data.
 */
static void md_end_flush(struct bio *bio)
{
	struct md_rdev *rdev = bio->bi_private;
	struct mddev *mddev = rdev->mddev;

	rdev_dec_pending(rdev, mddev);

	if (atomic_dec_and_test(&mddev->flush_pending)) {
		/* The pre-request flush has finished */
		queue_work(md_wq, &mddev->flush_work);
	}
	bio_put(bio);
}
| |
| static void md_submit_flush_data(struct work_struct *ws); |
| |
/*
 * Issue an empty REQ_PREFLUSH bio to every active member device.
 * flush_pending starts at 1 so the work is only queued once all bios
 * (and this function) have finished their decrements.
 */
static void submit_flushes(struct work_struct *ws)
{
	struct mddev *mddev = container_of(ws, struct mddev, flush_work);
	struct md_rdev *rdev;

	mddev->start_flush = ktime_get_boottime();
	/* re-point flush_work at the second stage before any bio completes */
	INIT_WORK(&mddev->flush_work, md_submit_flush_data);
	atomic_set(&mddev->flush_pending, 1);
	rcu_read_lock();
	rdev_for_each_rcu(rdev, mddev)
		if (rdev->raid_disk >= 0 &&
		    !test_bit(Faulty, &rdev->flags)) {
			/* Take two references, one is dropped
			 * when request finishes, one after
			 * we reclaim rcu_read_lock
			 */
			struct bio *bi;
			atomic_inc(&rdev->nr_pending);
			atomic_inc(&rdev->nr_pending);
			rcu_read_unlock();
			bi = bio_alloc_mddev(GFP_NOIO, 0, mddev);
			bi->bi_end_io = md_end_flush;
			bi->bi_private = rdev;
			bio_set_dev(bi, rdev->bdev);
			bi->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
			atomic_inc(&mddev->flush_pending);
			submit_bio(bi);
			rcu_read_lock();
			rdev_dec_pending(rdev, mddev);
		}
	rcu_read_unlock();
	/* drop the initial reference; last one queues the second stage */
	if (atomic_dec_and_test(&mddev->flush_pending))
		queue_work(md_wq, &mddev->flush_work);
}
| |
/*
 * Second stage of flush handling: all member flushes are done, so
 * complete an empty barrier directly or submit the data portion.
 */
static void md_submit_flush_data(struct work_struct *ws)
{
	struct mddev *mddev = container_of(ws, struct mddev, flush_work);
	struct bio *bio = mddev->flush_bio;

	/*
	 * must reset flush_bio before calling into md_handle_request to avoid a
	 * deadlock, because other bios passed md_handle_request suspend check
	 * could wait for this and below md_handle_request could wait for those
	 * bios because of suspend check
	 */
	mddev->last_flush = mddev->start_flush;
	mddev->flush_bio = NULL;
	wake_up(&mddev->sb_wait);

	if (bio->bi_iter.bi_size == 0) {
		/* an empty barrier - all done */
		bio_endio(bio);
	} else {
		bio->bi_opf &= ~REQ_PREFLUSH;
		md_handle_request(mddev, bio);
	}
}
| |
| /* |
| * Manages consolidation of flushes and submitting any flushes needed for |
| * a bio with REQ_PREFLUSH. Returns true if the bio is finished or is |
| * being finished in another context. Returns false if the flushing is |
| * complete but still needs the I/O portion of the bio to be processed. |
| */ |
/*
 * Manages consolidation of flushes and submitting any flushes needed for
 * a bio with REQ_PREFLUSH. Returns true if the bio is finished or is
 * being finished in another context. Returns false if the flushing is
 * complete but still needs the I/O portion of the bio to be processed.
 */
bool md_flush_request(struct mddev *mddev, struct bio *bio)
{
	ktime_t start = ktime_get_boottime();
	spin_lock_irq(&mddev->lock);
	/* wait for the flush slot, or for a newer flush to complete */
	wait_event_lock_irq(mddev->sb_wait,
			    !mddev->flush_bio ||
			    ktime_after(mddev->last_flush, start),
			    mddev->lock);
	if (!ktime_after(mddev->last_flush, start)) {
		/* no flush newer than us has run: we own the slot */
		WARN_ON(mddev->flush_bio);
		mddev->flush_bio = bio;
		bio = NULL;
	}
	spin_unlock_irq(&mddev->lock);

	if (!bio) {
		INIT_WORK(&mddev->flush_work, submit_flushes);
		queue_work(md_wq, &mddev->flush_work);
	} else {
		/* flush was performed for some other bio while we waited. */
		if (bio->bi_iter.bi_size == 0)
			/* an empty barrier - all done */
			bio_endio(bio);
		else {
			bio->bi_opf &= ~REQ_PREFLUSH;
			return false;
		}
	}
	return true;
}
EXPORT_SYMBOL(md_flush_request);
| |
/* Take an extra active reference on @mddev; paired with mddev_put(). */
static inline struct mddev *mddev_get(struct mddev *mddev)
{
	atomic_inc(&mddev->active);
	return mddev;
}
| |
| static void mddev_delayed_delete(struct work_struct *ws); |
| |
/*
 * Drop an active reference; when the last one goes and the array is
 * completely unconfigured and not held active, schedule its deletion.
 */
static void mddev_put(struct mddev *mddev)
{
	if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock))
		return;
	if (!mddev->raid_disks && list_empty(&mddev->disks) &&
	    mddev->ctime == 0 && !mddev->hold_active) {
		/* Array is not configured at all, and not held active,
		 * so destroy it */
		list_del_init(&mddev->all_mddevs);

		/*
		 * Call queue_work inside the spinlock so that
		 * flush_workqueue() after mddev_find will succeed in waiting
		 * for the work to be done.
		 */
		INIT_WORK(&mddev->del_work, mddev_delayed_delete);
		queue_work(md_misc_wq, &mddev->del_work);
	}
	spin_unlock(&all_mddevs_lock);
}
| |
| static void md_safemode_timeout(struct timer_list *t); |
| |
/* Initialise a freshly-allocated mddev to its default, unconfigured state. */
void mddev_init(struct mddev *mddev)
{
	kobject_init(&mddev->kobj, &md_ktype);
	mutex_init(&mddev->open_mutex);
	mutex_init(&mddev->reconfig_mutex);
	mutex_init(&mddev->bitmap_info.mutex);
	INIT_LIST_HEAD(&mddev->disks);
	INIT_LIST_HEAD(&mddev->all_mddevs);
	timer_setup(&mddev->safemode_timer, md_safemode_timeout, 0);
	atomic_set(&mddev->active, 1);
	atomic_set(&mddev->openers, 0);
	atomic_set(&mddev->active_io, 0);
	spin_lock_init(&mddev->lock);
	atomic_set(&mddev->flush_pending, 0);
	init_waitqueue_head(&mddev->sb_wait);
	init_waitqueue_head(&mddev->recovery_wait);
	mddev->reshape_position = MaxSector;	/* MaxSector == no reshape */
	mddev->reshape_backwards = 0;
	mddev->last_sync_action = "none";
	mddev->resync_min = 0;
	mddev->resync_max = MaxSector;
	mddev->level = LEVEL_NONE;
}
EXPORT_SYMBOL_GPL(mddev_init);
| |
/*
 * Find (or create) the mddev for device number @unit.  With unit == 0
 * a free MD_MAJOR minor starting from 512 is picked instead.  Returns
 * the mddev with an active reference held, or NULL on failure.
 * Allocation happens outside all_mddevs_lock, then the lookup retries.
 */
static struct mddev *mddev_find(dev_t unit)
{
	struct mddev *mddev, *new = NULL;

	/* partitionable (mdp) devices reserve the low minor bits */
	if (unit && MAJOR(unit) != MD_MAJOR)
		unit &= ~((1<<MdpMinorShift)-1);

 retry:
	spin_lock(&all_mddevs_lock);

	if (unit) {
		list_for_each_entry(mddev, &all_mddevs, all_mddevs)
			if (mddev->unit == unit) {
				mddev_get(mddev);
				spin_unlock(&all_mddevs_lock);
				kfree(new);	/* lost the race; discard ours */
				return mddev;
			}

		if (new) {
			/* second pass: nobody beat us, publish our mddev */
			list_add(&new->all_mddevs, &all_mddevs);
			spin_unlock(&all_mddevs_lock);
			new->hold_active = UNTIL_IOCTL;
			return new;
		}
	} else if (new) {
		/* find an unused unit number */
		static int next_minor = 512;
		int start = next_minor;
		int is_free = 0;
		int dev = 0;
		while (!is_free) {
			dev = MKDEV(MD_MAJOR, next_minor);
			next_minor++;
			if (next_minor > MINORMASK)
				next_minor = 0;
			if (next_minor == start) {
				/* Oh dear, all in use. */
				spin_unlock(&all_mddevs_lock);
				kfree(new);
				return NULL;
			}

			is_free = 1;
			list_for_each_entry(mddev, &all_mddevs, all_mddevs)
				if (mddev->unit == dev) {
					is_free = 0;
					break;
				}
		}
		new->unit = dev;
		new->md_minor = MINOR(dev);
		new->hold_active = UNTIL_STOP;
		list_add(&new->all_mddevs, &all_mddevs);
		spin_unlock(&all_mddevs_lock);
		return new;
	}
	spin_unlock(&all_mddevs_lock);

	/* not found: allocate outside the lock, then retry the lookup */
	new = kzalloc(sizeof(*new), GFP_KERNEL);
	if (!new)
		return NULL;

	new->unit = unit;
	if (MAJOR(unit) == MD_MAJOR)
		new->md_minor = MINOR(unit);
	else
		new->md_minor = MINOR(unit) >> MdpMinorShift;

	mddev_init(new);

	goto retry;
}
| |
| static struct attribute_group md_redundancy_group; |
| |
/*
 * Drop reconfig_mutex, removing any sysfs groups queued in ->to_remove
 * after the mutex is released (removal under the mutex would deadlock).
 */
void mddev_unlock(struct mddev *mddev)
{
	if (mddev->to_remove) {
		/* These cannot be removed under reconfig_mutex as
		 * an access to the files will try to take reconfig_mutex
		 * while holding the file unremovable, which leads to
		 * a deadlock.
		 * So set sysfs_active while the removal is happening,
		 * and anything else which might set ->to_remove or
		 * otherwise change the sysfs namespace will fail with
		 * -EBUSY if sysfs_active is still set.
		 * We set sysfs_active under reconfig_mutex and elsewhere
		 * test it under the same mutex to ensure its correct value
		 * is seen.
		 */
		struct attribute_group *to_remove = mddev->to_remove;
		mddev->to_remove = NULL;
		mddev->sysfs_active = 1;
		mutex_unlock(&mddev->reconfig_mutex);

		if (mddev->kobj.sd) {
			if (to_remove != &md_redundancy_group)
				sysfs_remove_group(&mddev->kobj, to_remove);
			if (mddev->pers == NULL ||
			    mddev->pers->sync_request == NULL) {
				sysfs_remove_group(&mddev->kobj, &md_redundancy_group);
				if (mddev->sysfs_action)
					sysfs_put(mddev->sysfs_action);
				mddev->sysfs_action = NULL;
			}
		}
		mddev->sysfs_active = 0;
	} else
		mutex_unlock(&mddev->reconfig_mutex);

	/* As we've dropped the mutex we need a spinlock to
	 * make sure the thread doesn't disappear
	 */
	spin_lock(&pers_lock);
	md_wakeup_thread(mddev->thread);
	wake_up(&mddev->sb_wait);
	spin_unlock(&pers_lock);
}
EXPORT_SYMBOL_GPL(mddev_unlock);
| |
| struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr) |
| { |
| struct md_rdev *rdev; |
| |
| rdev_for_each_rcu(rdev, mddev) |
| if (rdev->desc_nr == nr) |
| return rdev; |
| |
| return NULL; |
| } |
| EXPORT_SYMBOL_GPL(md_find_rdev_nr_rcu); |
| |
| static struct md_rdev *find_rdev(struct mddev *mddev, dev_t dev) |
| { |
| struct md_rdev *rdev; |
| |
| rdev_for_each(rdev, mddev) |
| if (rdev->bdev->bd_dev == dev) |
| return rdev; |
| |
| return NULL; |
| } |
| |
| struct md_rdev *md_find_rdev_rcu(struct mddev *mddev, dev_t dev) |
| { |
| struct md_rdev *rdev; |
| |
| rdev_for_each_rcu(rdev, mddev) |
| if (rdev->bdev->bd_dev == dev) |
| return rdev; |
| |
| return NULL; |
| } |
| EXPORT_SYMBOL_GPL(md_find_rdev_rcu); |
| |
| static struct md_personality *find_pers(int level, char *clevel) |
| { |
| struct md_personality *pers; |
| list_for_each_entry(pers, &pers_list, list) { |
| if (level != LEVEL_NONE && pers->level == level) |
| return pers; |
| if (strcmp(pers->name, clevel)==0) |
| return pers; |
| } |
| return NULL; |
| } |
| |
| /* return the offset of the super block in 512byte sectors */ |
| static inline sector_t calc_dev_sboffset(struct md_rdev *rdev) |
| { |
| sector_t num_sectors = i_size_read(rdev->bdev->bd_inode) / 512; |
| return MD_NEW_SIZE_SECTORS(num_sectors); |
| } |
| |
| static int alloc_disk_sb(struct md_rdev *rdev) |
| { |
| rdev->sb_page = alloc_page(GFP_KERNEL); |
| if (!rdev->sb_page) |
| return -ENOMEM; |
| return 0; |
| } |
| |
/* Release @rdev's superblock and bad-block pages and reset related state. */
void md_rdev_clear(struct md_rdev *rdev)
{
	if (rdev->sb_page) {
		put_page(rdev->sb_page);
		rdev->sb_loaded = 0;
		rdev->sb_page = NULL;
		rdev->sb_start = 0;
		rdev->sectors = 0;
	}
	if (rdev->bb_page) {
		put_page(rdev->bb_page);
		rdev->bb_page = NULL;
	}
	badblocks_exit(&rdev->badblocks);
}
EXPORT_SYMBOL_GPL(md_rdev_clear);
| |
/*
 * Completion handler for superblock writes issued by md_super_write().
 * On error the device is failed; a failfast error on the last working
 * device instead requests a rewrite without the failfast flag.
 */
static void super_written(struct bio *bio)
{
	struct md_rdev *rdev = bio->bi_private;
	struct mddev *mddev = rdev->mddev;

	if (bio->bi_status) {
		pr_err("md: super_written gets error=%d\n", bio->bi_status);
		md_error(mddev, rdev);
		if (!test_bit(Faulty, &rdev->flags)
		    && (bio->bi_opf & MD_FAILFAST)) {
			set_bit(MD_SB_NEED_REWRITE, &mddev->sb_flags);
			set_bit(LastDev, &rdev->flags);
		}
	} else
		clear_bit(LastDev, &rdev->flags);

	/* last pending write out wakes md_super_wait() */
	if (atomic_dec_and_test(&mddev->pending_writes))
		wake_up(&mddev->sb_wait);
	rdev_dec_pending(rdev, mddev);
	bio_put(bio);
}
| |
void md_super_write(struct mddev *mddev, struct md_rdev *rdev,
		    sector_t sector, int size, struct page *page)
{
	/* write first size bytes of page to sector of rdev
	 * Increment mddev->pending_writes before returning
	 * and decrement it on completion, waking up sb_wait
	 * if zero is reached.
	 * If an error occurred, call md_error
	 */
	struct bio *bio;
	int ff = 0;

	if (!page)
		return;

	/* never write the superblock of a failed device */
	if (test_bit(Faulty, &rdev->flags))
		return;

	bio = md_bio_alloc_sync(mddev);

	atomic_inc(&rdev->nr_pending);

	/* metadata may live on a separate device (meta_bdev) */
	bio_set_dev(bio, rdev->meta_bdev ? rdev->meta_bdev : rdev->bdev);
	bio->bi_iter.bi_sector = sector;
	bio_add_page(bio, page, size, 0);
	bio->bi_private = rdev;
	bio->bi_end_io = super_written;

	/* failfast only while another working device remains */
	if (test_bit(MD_FAILFAST_SUPPORTED, &mddev->flags) &&
	    test_bit(FailFast, &rdev->flags) &&
	    !test_bit(LastDev, &rdev->flags))
		ff = MD_FAILFAST;
	bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH | REQ_FUA | ff;

	atomic_inc(&mddev->pending_writes);
	submit_bio(bio);
}
| |
| int md_super_wait(struct mddev *mddev) |
| { |
| /* wait for all superblock writes that were scheduled to complete */ |
| wait_event(mddev->sb_wait, atomic_read(&mddev->pending_writes)==0); |
| if (test_and_clear_bit(MD_SB_NEED_REWRITE, &mddev->sb_flags)) |
| return -EAGAIN; |
| return 0; |
| } |
| |
/*
 * Synchronously read or write @size bytes of @page at @sector of @rdev.
 * Metadata I/O is offset from sb_start (and may target meta_bdev); data
 * I/O is offset from data_offset, or new_data_offset for sectors on the
 * already-reshaped side of an in-progress reshape.
 * Returns 1 on success, 0 on failure.
 */
int sync_page_io(struct md_rdev *rdev, sector_t sector, int size,
		 struct page *page, int op, int op_flags, bool metadata_op)
{
	struct bio *bio = md_bio_alloc_sync(rdev->mddev);
	int ret;

	if (metadata_op && rdev->meta_bdev)
		bio_set_dev(bio, rdev->meta_bdev);
	else
		bio_set_dev(bio, rdev->bdev);
	bio_set_op_attrs(bio, op, op_flags);
	if (metadata_op)
		bio->bi_iter.bi_sector = sector + rdev->sb_start;
	else if (rdev->mddev->reshape_position != MaxSector &&
		 (rdev->mddev->reshape_backwards ==
		  (sector >= rdev->mddev->reshape_position)))
		bio->bi_iter.bi_sector = sector + rdev->new_data_offset;
	else
		bio->bi_iter.bi_sector = sector + rdev->data_offset;
	bio_add_page(bio, page, size, 0);

	submit_bio_wait(bio);

	ret = !bio->bi_status;
	bio_put(bio);
	return ret;
}
EXPORT_SYMBOL_GPL(sync_page_io);
| |
| static int read_disk_sb(struct md_rdev *rdev, int size) |
| { |
| char b[BDEVNAME_SIZE]; |
| |
| if (rdev->sb_loaded) |
| return 0; |
| |
| if (!sync_page_io(rdev, 0, size, rdev->sb_page, REQ_OP_READ, 0, true)) |
| goto fail; |
| rdev->sb_loaded = 1; |
| return 0; |
| |
| fail: |
| pr_err("md: disabled device %s, could not read superblock.\n", |
| bdevname(rdev->bdev,b)); |
| return -EINVAL; |
| } |
| |
| static int md_uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2) |
| { |
| return sb1->set_uuid0 == sb2->set_uuid0 && |
| sb1->set_uuid1 == sb2->set_uuid1 && |
| sb1->set_uuid2 == sb2->set_uuid2 && |
| sb1->set_uuid3 == sb2->set_uuid3; |
| } |
| |
| static int md_sb_equal(mdp_super_t *sb1, mdp_super_t *sb2) |
| { |
| int ret; |
| mdp_super_t *tmp1, *tmp2; |
| |
| tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL); |
| tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL); |
| |
| if (!tmp1 || !tmp2) { |
| ret = 0; |
| goto abort; |
| } |
| |
| *tmp1 = *sb1; |
| *tmp2 = *sb2; |
| |
| /* |
| * nr_disks is not constant |
| */ |
| tmp1->nr_disks = 0; |
| tmp2->nr_disks = 0; |
| |
| ret = (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4) == 0); |
| abort: |
| kfree(tmp1); |
| kfree(tmp2); |
| return ret; |
| } |
| |
/*
 * Fold a 32-bit checksum to 16 bits (ones-complement style).  Folding
 * twice is needed because the first add can itself carry into bit 16.
 */
static u32 md_csum_fold(u32 csum)
{
	u32 folded = (csum & 0xffff) + (csum >> 16);

	return (folded & 0xffff) + (folded >> 16);
}
| |
/*
 * Checksum of a 0.90 superblock: 64-bit sum of its 32-bit words with
 * the csum field zeroed, folded once to 32 bits.  sb->sb_csum is
 * restored before returning.
 */
static unsigned int calc_sb_csum(mdp_super_t *sb)
{
	u64 newcsum = 0;
	u32 *sb32 = (u32*)sb;
	int i;
	unsigned int disk_csum, csum;

	disk_csum = sb->sb_csum;
	sb->sb_csum = 0;	/* the field must not contribute to the sum */

	for (i = 0; i < MD_SB_BYTES/4 ; i++)
		newcsum += sb32[i];
	csum = (newcsum & 0xffffffff) + (newcsum>>32);

#ifdef CONFIG_ALPHA
	/* This used to use csum_partial, which was wrong for several
	 * reasons including that different results are returned on
	 * different architectures.  It isn't critical that we get exactly
	 * the same return value as before (we always csum_fold before
	 * testing, and that removes any differences).  However as we
	 * know that csum_partial always returned a 16bit value on
	 * alphas, do a fold to maximise conformity to previous behaviour.
	 */
	sb->sb_csum = md_csum_fold(disk_csum);
#else
	sb->sb_csum = disk_csum;
#endif
	return csum;
}
| |
| /* |
| * Handle superblock details. |
| * We want to be able to handle multiple superblock formats |
| * so we have a common interface to them all, and an array of |
| * different handlers. |
| * We rely on user-space to write the initial superblock, and support |
| * reading and updating of superblocks. |
| * Interface methods are: |
| * int load_super(struct md_rdev *dev, struct md_rdev *refdev, int minor_version) |
| * loads and validates a superblock on dev. |
| * if refdev != NULL, compare superblocks on both devices |
| * Return: |
| * 0 - dev has a superblock that is compatible with refdev |
| * 1 - dev has a superblock that is compatible and newer than refdev |
| * so dev should be used as the refdev in future |
| * -EINVAL superblock incompatible or invalid |
| * -othererror e.g. -EIO |
| * |
| * int validate_super(struct mddev *mddev, struct md_rdev *dev) |
| * Verify that dev is acceptable into mddev. |
| * The first time, mddev->raid_disks will be 0, and data from |
| * dev should be merged in. Subsequent calls check that dev |
| * is new enough. Return 0 or -EINVAL |
| * |
| * void sync_super(struct mddev *mddev, struct md_rdev *dev) |
| * Update the superblock for rdev with data in mddev |
| * This does not write to disc. |
| * |
| */ |
| |
/*
 * One entry per supported on-disk metadata format; the method contract
 * is described in the interface comment block above.
 */
struct super_type {
	char *name;			/* metadata format name */
	struct module *owner;
	int (*load_super)(struct md_rdev *rdev,
			  struct md_rdev *refdev,
			  int minor_version);
	int (*validate_super)(struct mddev *mddev,
			      struct md_rdev *rdev);
	void (*sync_super)(struct mddev *mddev,
			   struct md_rdev *rdev);
	unsigned long long (*rdev_size_change)(struct md_rdev *rdev,
					       sector_t num_sectors);
	int (*allow_new_offset)(struct md_rdev *rdev,
				unsigned long long new_offset);
};
| |
| /* |
| * Check that the given mddev has no bitmap. |
| * |
| * This function is called from the run method of all personalities that do not |
| * support bitmaps. It prints an error message and returns non-zero if mddev |
| * has a bitmap. Otherwise, it returns 0. |
| * |
| */ |
| int md_check_no_bitmap(struct mddev *mddev) |
| { |
| if (!mddev->bitmap_info.file && !mddev->bitmap_info.offset) |
| return 0; |
| pr_warn("%s: bitmaps are not supported for %s\n", |
| mdname(mddev), mddev->pers->name); |
| return 1; |
| } |
| EXPORT_SYMBOL(md_check_no_bitmap); |
| |
| /* |
| * load_super for 0.90.0 |
| */ |
/*
 * Load and validate a 0.90 superblock from @rdev (magic, version,
 * checksum).  With @refdev, also check both devices belong to the same
 * array and compare event counts.  Returns 1 if @rdev should become the
 * new refdev, 0 if compatible but older, -EINVAL/-EIO on failure.
 */
static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version)
{
	char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
	mdp_super_t *sb;
	int ret;

	/*
	 * Calculate the position of the superblock (512byte sectors),
	 * it's at the end of the disk.
	 *
	 * It also happens to be a multiple of 4Kb.
	 */
	rdev->sb_start = calc_dev_sboffset(rdev);

	ret = read_disk_sb(rdev, MD_SB_BYTES);
	if (ret)
		return ret;

	ret = -EINVAL;

	bdevname(rdev->bdev, b);
	sb = page_address(rdev->sb_page);

	if (sb->md_magic != MD_SB_MAGIC) {
		pr_warn("md: invalid raid superblock magic on %s\n", b);
		goto abort;
	}

	if (sb->major_version != 0 ||
	    sb->minor_version < 90 ||
	    sb->minor_version > 91) {
		pr_warn("Bad version number %d.%d on %s\n",
			sb->major_version, sb->minor_version, b);
		goto abort;
	}

	if (sb->raid_disks <= 0)
		goto abort;

	if (md_csum_fold(calc_sb_csum(sb)) != md_csum_fold(sb->sb_csum)) {
		pr_warn("md: invalid superblock checksum on %s\n", b);
		goto abort;
	}

	rdev->preferred_minor = sb->md_minor;
	rdev->data_offset = 0;
	rdev->new_data_offset = 0;
	rdev->sb_size = MD_SB_BYTES;
	rdev->badblocks.shift = -1;	/* 0.90 cannot record bad blocks */

	if (sb->level == LEVEL_MULTIPATH)
		rdev->desc_nr = -1;
	else
		rdev->desc_nr = sb->this_disk.number;

	if (!refdev) {
		/* first device seen: it becomes the reference */
		ret = 1;
	} else {
		__u64 ev1, ev2;
		mdp_super_t *refsb = page_address(refdev->sb_page);
		if (!md_uuid_equal(refsb, sb)) {
			pr_warn("md: %s has different UUID to %s\n",
				b, bdevname(refdev->bdev,b2));
			goto abort;
		}
		if (!md_sb_equal(refsb, sb)) {
			pr_warn("md: %s has same UUID but different superblock to %s\n",
				b, bdevname(refdev->bdev, b2));
			goto abort;
		}
		/* the device with the higher event count is newer */
		ev1 = md_event(sb);
		ev2 = md_event(refsb);
		if (ev1 > ev2)
			ret = 1;
		else
			ret = 0;
	}
	rdev->sectors = rdev->sb_start;
	/* Limit to 4TB as metadata cannot record more than that.
	 * (not needed for Linear and RAID0 as metadata doesn't
	 * record this size)
	 */
	if (IS_ENABLED(CONFIG_LBDAF) && (u64)rdev->sectors >= (2ULL << 32) &&
	    sb->level >= 1)
		rdev->sectors = (sector_t)(2ULL << 32) - 2;

	if (rdev->sectors < ((sector_t)sb->size) * 2 && sb->level >= 1)
		/* "this cannot possibly happen" ... */
		ret = -EINVAL;

 abort:
	return ret;
}
| |
| /* |
| * validate_super for 0.90.0 |
| */ |
| static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev) |
| { |
| mdp_disk_t *desc; |
| mdp_super_t *sb = page_address(rdev->sb_page); |
| __u64 ev1 = md_event(sb); |
| |
| rdev->raid_disk = -1; |
| clear_bit(Faulty, &rdev->flags); |
| clear_bit(In_sync, &rdev->flags); |
| clear_bit(Bitmap_sync, &rdev->flags); |
| clear_bit(WriteMostly, &rdev->flags); |
| |
| if (mddev->raid_disks == 0) { |
| mddev->major_version = 0; |
| mddev->minor_version = sb->minor_version; |
| mddev->patch_version = sb->patch_version; |
| mddev->external = 0; |
| mddev->chunk_sectors = sb->chunk_size >> 9; |
| mddev->ctime = sb->ctime; |
| mddev->utime = sb->utime; |
| mddev->level = sb->level; |
| mddev->clevel[0] = 0; |
| mddev->layout = sb->layout; |
| mddev->raid_disks = sb->raid_disks; |
| mddev->dev_sectors = ((sector_t)sb->size) * 2; |
| mddev->events = ev1; |
| mddev->bitmap_info.offset = 0; |
| mddev->bitmap_info.space = 0; |
| /* bitmap can use 60 K after the 4K superblocks */ |
| mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9; |
| mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9); |
| mddev->reshape_backwards = 0; |
| |
| if (mddev->minor_version >= 91) { |
| mddev->reshape_position = sb->reshape_position; |
| mddev->delta_disks = sb->delta_disks; |
| mddev->new_level = sb->new_level; |
| mddev->new_layout = sb->new_layout; |
| mddev->new_chunk_sectors = sb->new_chunk >> 9; |
| if (mddev->delta_disks < 0) |
| mddev->reshape_backwards = 1; |
| } else { |
| mddev->reshape_position = MaxSector; |
| mddev->delta_disks = 0; |
| mddev->new_level = mddev->level; |
| mddev->new_layout = mddev->layout; |
| mddev->new_chunk_sectors = mddev->chunk_sectors; |
| } |
| |
| if (sb->state & (1<<MD_SB_CLEAN)) |
| mddev->recovery_cp = MaxSector; |
| else { |
| if (sb->events_hi == sb->cp_events_hi && |
| sb->events_lo == sb->cp_events_lo) { |
| mddev->recovery_cp = sb->recovery_cp; |
| } else |
| mddev->recovery_cp = 0; |
| } |
| |
| memcpy(mddev->uuid+0, &sb->set_uuid0, 4); |
| memcpy(mddev->uuid+4, &sb->set_uuid1, 4); |
| memcpy(mddev->uuid+8, &sb->set_uuid2, 4); |
| memcpy(mddev->uuid+12,&sb->set_uuid3, 4); |
| |
| mddev->max_disks = MD_SB_DISKS; |
| |
| if (sb->state & (1<<MD_SB_BITMAP_PRESENT) && |
| mddev->bitmap_info.file == NULL) { |
| mddev->bitmap_info.offset = |
| mddev->bitmap_info.default_offset; |
| mddev->bitmap_info.space = |
| mddev->bitmap_info.default_space; |
| } |
| |
| } else if (mddev->pers == NULL) { |
| /* Insist on good event counter while assembling, except |
| * for spares (which don't need an event count) */ |
| ++ev1; |
| if (sb->disks[rdev->desc_nr].state & ( |
| (1<<MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE))) |
| if (ev1 < mddev->events) |
| return -EINVAL; |
| } else if (mddev->bitmap) { |
| /* if adding to array with a bitmap, then we can accept an |
| * older device ... but not too old. |
| */ |
| if (ev1 < mddev->bitmap->events_cleared) |
| return 0; |
| if (ev1 < mddev->events) |
| set_bit(Bitmap_sync, &rdev->flags); |
| } else { |
| if (ev1 < mddev->events) |
| /* just a hot-add of a new device, leave raid_disk at -1 */ |
| return 0; |
| } |
| |
| if (mddev->level != LEVEL_MULTIPATH) { |
| desc = sb->disks + rdev->desc_nr; |
| |
| if (desc->state & (1<<MD_DISK_FAULTY)) |
| set_bit(Faulty, &rdev->flags); |
| else if (desc->state & (1<<MD_DISK_SYNC) /* && |
| desc->raid_disk < mddev->raid_disks */) { |
| set_bit(In_sync, &rdev->flags); |
| rdev->raid_disk = desc->raid_disk; |
| rdev->saved_raid_disk = desc->raid_disk; |
| } else if (desc->state & (1<<MD_DISK_ACTIVE)) { |
| /* active but not in sync implies recovery up to |
| * reshape position. We don't know exactly where |
| * that is, so set to zero for now */ |
| if (mddev->minor_version >= 91) { |
| rdev->recovery_offset = 0; |
| rdev->raid_disk = desc->raid_disk; |
| } |
| } |
| if (desc->state & (1<<MD_DISK_WRITEMOSTLY)) |
| set_bit(WriteMostly, &rdev->flags); |
| if (desc->state & (1<<MD_DISK_FAILFAST)) |
| set_bit(FailFast, &rdev->flags); |
| } else /* MULTIPATH are always insync */ |
| set_bit(In_sync, &rdev->flags); |
| return 0; |
| } |
| |
| /* |
| * sync_super for 0.90.0 |
| */ |
| static void super_90_sync(struct mddev *mddev, struct md_rdev *rdev) |
| { |
| mdp_super_t *sb; |
| struct md_rdev *rdev2; |
| int next_spare = mddev->raid_disks; |
| |
| /* make rdev->sb match mddev data.. |
| * |
| * 1/ zero out disks |
| * 2/ Add info for each disk, keeping track of highest desc_nr (next_spare); |
| * 3/ any empty disks < next_spare become removed |
| * |
| * disks[0] gets initialised to REMOVED because |
| * we cannot be sure from other fields if it has |
| * been initialised or not. |
| */ |
| int i; |
| int active=0, working=0,failed=0,spare=0,nr_disks=0; |
| |
| rdev->sb_size = MD_SB_BYTES; |
| |
| sb = page_address(rdev->sb_page); |
| |
| memset(sb, 0, sizeof(*sb)); |
| |
| sb->md_magic = MD_SB_MAGIC; |
| sb->major_version = mddev->major_version; |
| sb->patch_version = mddev->patch_version; |
| sb->gvalid_words = 0; /* ignored */ |
| memcpy(&sb->set_uuid0, mddev->uuid+0, 4); |
| memcpy(&sb->set_uuid1, mddev->uuid+4, 4); |
| memcpy(&sb->set_uuid2, mddev->uuid+8, 4); |
| memcpy(&sb->set_uuid3, mddev->uuid+12,4); |
| |
| sb->ctime = clamp_t(time64_t, mddev->ctime, 0, U32_MAX); |
| sb->level = mddev->level; |
| sb->size = mddev->dev_sectors / 2; |
| sb->raid_disks = mddev->raid_disks; |
| sb->md_minor = mddev->md_minor; |
| sb->not_persistent = 0; |
| sb->utime = clamp_t(time64_t, mddev->utime, 0, U32_MAX); |
| sb->state = 0; |
| sb->events_hi = (mddev->events>>32); |
| sb->events_lo = (u32)mddev->events; |
| |
| if (mddev->reshape_position == MaxSector) |
| sb->minor_version = 90; |
| else { |
| sb->minor_version = 91; |
| sb->reshape_position = mddev->reshape_position; |
| sb->new_level = mddev->new_level; |
| sb->delta_disks = mddev->delta_disks; |
| sb->new_layout = mddev->new_layout; |
| sb->new_chunk = mddev->new_chunk_sectors << 9; |
| } |
| mddev->minor_version = sb->minor_version; |
| if (mddev->in_sync) |
| { |
| sb->recovery_cp = mddev->recovery_cp; |
| sb->cp_events_hi = (mddev->events>>32); |
| sb->cp_events_lo = (u32)mddev->events; |
| if (mddev->recovery_cp == MaxSector) |
| sb->state = (1<< MD_SB_CLEAN); |
| } else |
| sb->recovery_cp = 0; |
| |
| sb->layout = mddev->layout; |
| sb->chunk_size = mddev->chunk_sectors << 9; |
| |
| if (mddev->bitmap && mddev->bitmap_info.file == NULL) |
| sb->state |= (1<<MD_SB_BITMAP_PRESENT); |
| |
| sb->disks[0].state = (1<<MD_DISK_REMOVED); |
| rdev_for_each(rdev2, mddev) { |
| mdp_disk_t *d; |
| int desc_nr; |
| int is_active = test_bit(In_sync, &rdev2->flags); |
| |
| if (rdev2->raid_disk >= 0 && |
| sb->minor_version >= 91) |
| /* we have nowhere to store the recovery_offset, |
| * but if it is not below the reshape_position, |
| * we can piggy-back on that. |
| */ |
| is_active = 1; |
| if (rdev2->raid_disk < 0 || |
| test_bit(Faulty, &rdev2->flags)) |
| is_active = 0; |
| if (is_active) |
| desc_nr = rdev2->raid_disk; |
| else |
| desc_nr = next_spare++; |
| rdev2->desc_nr = desc_nr; |
| d = &sb->disks[rdev2->desc_nr]; |
| nr_disks++; |
| d->number = rdev2->desc_nr; |
| d->major = MAJOR(rdev2->bdev->bd_dev); |
| d->minor = MINOR(rdev2->bdev->bd_dev); |
| if (is_active) |
| d->raid_disk = rdev2->raid_disk; |
| else |
| d->raid_disk = rdev2->desc_nr; /* compatibility */ |
| if (test_bit(Faulty, &rdev2->flags)) |
| d->state = (1<<MD_DISK_FAULTY); |
| else if (is_active) { |
| d->state = (1<<MD_DISK_ACTIVE); |
| if (test_bit(In_sync, &rdev2->flags)) |
| d->state |= (1<<MD_DISK_SYNC); |
| active++; |
| working++; |
| } else { |
| d->state = 0; |
| spare++; |
| working++; |
| } |
| if (test_bit(WriteMostly, &rdev2->flags)) |
| d->state |= (1<<MD_DISK_WRITEMOSTLY); |
| if (test_bit(FailFast, &rdev2->flags)) |
| d->state |= (1<<MD_DISK_FAILFAST); |
| } |
| /* now set the "removed" and "faulty" bits on any missing devices */ |
| for (i=0 ; i < mddev->raid_disks ; i++) { |
| mdp_disk_t *d = &sb->disks[i]; |
| if (d->state == 0 && d->number == 0) { |
| d->number = i; |
| d->raid_disk = i; |
| d->state = (1<<MD_DISK_REMOVED); |
| d->state |= (1<<MD_DISK_FAULTY); |
| failed++; |
| } |
| } |
| sb->nr_disks = nr_disks; |
| sb->active_disks = active; |
| sb->working_disks = working; |
| sb->failed_disks = failed; |
| sb->spare_disks = spare; |
| |
| sb->this_disk = sb->disks[rdev->desc_nr]; |
| sb->sb_csum = calc_sb_csum(sb); |
| } |
| |
| /* |
| * rdev_size_change for 0.90.0 |
| */ |
| static unsigned long long |
| super_90_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors) |
| { |
| if (num_sectors && num_sectors < rdev->mddev->dev_sectors) |
| return 0; /* component must fit device */ |
| if (rdev->mddev->bitmap_info.offset) |
| return 0; /* can't move bitmap */ |
| rdev->sb_start = calc_dev_sboffset(rdev); |
| if (!num_sectors || num_sectors > rdev->sb_start) |
| num_sectors = rdev->sb_start; |
| /* Limit to 4TB as metadata cannot record more than that. |
| * 4TB == 2^32 KB, or 2*2^32 sectors. |
| */ |
| if (IS_ENABLED(CONFIG_LBDAF) && (u64)num_sectors >= (2ULL << 32) && |
| rdev->mddev->level >= 1) |
| num_sectors = (sector_t)(2ULL << 32) - 2; |
| do { |
| md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size, |
| rdev->sb_page); |
| } while (md_super_wait(rdev->mddev) < 0); |
| return num_sectors; |
| } |
| |
static int
super_90_allow_new_offset(struct md_rdev *rdev, unsigned long long new_offset)
{
	/* v0.90 metadata has no concept of a movable data offset, so
	 * only the unchanged offset of zero is acceptable. */
	if (new_offset != 0)
		return 0;
	return 1;
}
| |
| /* |
| * version 1 superblock |
| */ |
| |
| static __le32 calc_sb_1_csum(struct mdp_superblock_1 *sb) |
| { |
| __le32 disk_csum; |
| u32 csum; |
| unsigned long long newcsum; |
| int size = 256 + le32_to_cpu(sb->max_dev)*2; |
| __le32 *isuper = (__le32*)sb; |
| |
| disk_csum = sb->sb_csum; |
| sb->sb_csum = 0; |
| newcsum = 0; |
| for (; size >= 4; size -= 4) |
| newcsum += le32_to_cpu(*isuper++); |
| |
| if (size == 2) |
| newcsum += le16_to_cpu(*(__le16*) isuper); |
| |
| csum = (newcsum & 0xffffffff) + (newcsum >> 32); |
| sb->sb_csum = disk_csum; |
| return cpu_to_le32(csum); |
| } |
| |
/*
 * load_super for v1.x metadata.
 *
 * Locate, read and sanity-check the v1 superblock on @rdev;
 * @minor_version selects where the superblock lives (see below).  Also
 * loads the bad-block list and PPL location if the corresponding
 * feature bits are set, and compares event counts against @refdev if
 * one is given.
 *
 * Returns:
 *   1        - superblock is valid and newer than refdev's (or no refdev)
 *   0        - superblock is valid but not newer than refdev's
 *   -EINVAL  - malformed or inconsistent metadata
 *   -ENOMEM  - could not allocate the bad-block page
 *   -EIO     - bad-block list could not be read
 *   other <0 - read error propagated from read_disk_sb()
 */
static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version)
{
	struct mdp_superblock_1 *sb;
	int ret;
	sector_t sb_start;
	sector_t sectors;
	char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
	int bmask;

	/*
	 * Calculate the position of the superblock in 512byte sectors.
	 * It is always aligned to a 4K boundary and
	 * depeding on minor_version, it can be:
	 * 0: At least 8K, but less than 12K, from end of device
	 * 1: At start of device
	 * 2: 4K from start of device.
	 */
	switch(minor_version) {
	case 0:
		sb_start = i_size_read(rdev->bdev->bd_inode) >> 9;
		sb_start -= 8*2;
		sb_start &= ~(sector_t)(4*2-1);
		break;
	case 1:
		sb_start = 0;
		break;
	case 2:
		sb_start = 8;
		break;
	default:
		return -EINVAL;
	}
	rdev->sb_start = sb_start;

	/* superblock is rarely larger than 1K, but it can be larger,
	 * and it is safe to read 4k, so we do that
	 */
	ret = read_disk_sb(rdev, 4096);
	if (ret) return ret;

	sb = page_address(rdev->sb_page);

	/* basic structural checks: magic, version, device count bound,
	 * self-consistent superblock offset, no unknown feature bits */
	if (sb->magic != cpu_to_le32(MD_SB_MAGIC) ||
	    sb->major_version != cpu_to_le32(1) ||
	    le32_to_cpu(sb->max_dev) > (4096-256)/2 ||
	    le64_to_cpu(sb->super_offset) != rdev->sb_start ||
	    (le32_to_cpu(sb->feature_map) & ~MD_FEATURE_ALL) != 0)
		return -EINVAL;

	if (calc_sb_1_csum(sb) != sb->sb_csum) {
		pr_warn("md: invalid superblock checksum on %s\n",
			bdevname(rdev->bdev,b));
		return -EINVAL;
	}
	if (le64_to_cpu(sb->data_size) < 10) {
		pr_warn("md: data_size too small on %s\n",
			bdevname(rdev->bdev,b));
		return -EINVAL;
	}
	if (sb->pad0 ||
	    sb->pad3[0] ||
	    memcmp(sb->pad3, sb->pad3+1, sizeof(sb->pad3) - sizeof(sb->pad3[1])))
		/* Some padding is non-zero, might be a new feature */
		return -EINVAL;

	rdev->preferred_minor = 0xffff;
	rdev->data_offset = le64_to_cpu(sb->data_offset);
	rdev->new_data_offset = rdev->data_offset;
	/* during a reshape the data may be moving to a new offset */
	if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE) &&
	    (le32_to_cpu(sb->feature_map) & MD_FEATURE_NEW_OFFSET))
		rdev->new_data_offset += (s32)le32_to_cpu(sb->new_offset);
	atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read));

	/* round the superblock size up to the logical block size */
	rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256;
	bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1;
	if (rdev->sb_size & bmask)
		rdev->sb_size = (rdev->sb_size | bmask) + 1;

	/* for formats with the superblock at the front, the data must
	 * not overlap it (either before or after the reshape) */
	if (minor_version
	    && rdev->data_offset < sb_start + (rdev->sb_size/512))
		return -EINVAL;
	if (minor_version
	    && rdev->new_data_offset < sb_start + (rdev->sb_size/512))
		return -EINVAL;

	if (sb->level == cpu_to_le32(LEVEL_MULTIPATH))
		rdev->desc_nr = -1;
	else
		rdev->desc_nr = le32_to_cpu(sb->dev_number);

	if (!rdev->bb_page) {
		rdev->bb_page = alloc_page(GFP_KERNEL);
		if (!rdev->bb_page)
			return -ENOMEM;
	}
	if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BAD_BLOCKS) &&
	    rdev->badblocks.count == 0) {
		/* need to load the bad block list.
		 * Currently we limit it to one page.
		 */
		s32 offset;
		sector_t bb_sector;
		u64 *bbp;
		int i;
		int sectors = le16_to_cpu(sb->bblog_size);
		if (sectors > (PAGE_SIZE / 512))
			return -EINVAL;
		offset = le32_to_cpu(sb->bblog_offset);
		if (offset == 0)
			return -EINVAL;
		bb_sector = (long long)offset;
		if (!sync_page_io(rdev, bb_sector, sectors << 9,
				  rdev->bb_page, REQ_OP_READ, 0, true))
			return -EIO;
		bbp = (u64 *)page_address(rdev->bb_page);
		rdev->badblocks.shift = sb->bblog_shift;
		for (i = 0 ; i < (sectors << (9-3)) ; i++, bbp++) {
			/* each 64-bit entry packs a 54-bit sector and a
			 * 10-bit length, both scaled by bblog_shift */
			u64 bb = le64_to_cpu(*bbp);
			int count = bb & (0x3ff);
			u64 sector = bb >> 10;
			sector <<= sb->bblog_shift;
			count <<= sb->bblog_shift;
			/* an all-ones entry terminates the list (the log
			 * page is pre-filled with 0xff when written) */
			if (bb + 1 == 0)
				break;
			if (badblocks_set(&rdev->badblocks, sector, count, 1))
				return -EINVAL;
		}
	} else if (sb->bblog_offset != 0)
		rdev->badblocks.shift = 0;

	if ((le32_to_cpu(sb->feature_map) &
	    (MD_FEATURE_PPL | MD_FEATURE_MULTIPLE_PPLS))) {
		rdev->ppl.offset = (__s16)le16_to_cpu(sb->ppl.offset);
		rdev->ppl.size = le16_to_cpu(sb->ppl.size);
		rdev->ppl.sector = rdev->sb_start + rdev->ppl.offset;
	}

	if (!refdev) {
		/* first device seen becomes the reference */
		ret = 1;
	} else {
		__u64 ev1, ev2;
		struct mdp_superblock_1 *refsb = page_address(refdev->sb_page);

		if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 ||
		    sb->level != refsb->level ||
		    sb->layout != refsb->layout ||
		    sb->chunksize != refsb->chunksize) {
			pr_warn("md: %s has strangely different superblock to %s\n",
				bdevname(rdev->bdev,b),
				bdevname(refdev->bdev,b2));
			return -EINVAL;
		}
		/* the event counts decide which superblock is most recent */
		ev1 = le64_to_cpu(sb->events);
		ev2 = le64_to_cpu(refsb->events);

		if (ev1 > ev2)
			ret = 1;
		else
			ret = 0;
	}
	/* verify the device is large enough for the recorded data size */
	if (minor_version) {
		sectors = (i_size_read(rdev->bdev->bd_inode) >> 9);
		sectors -= rdev->data_offset;
	} else
		sectors = rdev->sb_start;
	if (sectors < le64_to_cpu(sb->data_size))
		return -EINVAL;
	rdev->sectors = le64_to_cpu(sb->data_size);
	return ret;
}
| |
/*
 * validate_super for v1.x metadata.
 *
 * Interpret rdev's already-loaded v1 superblock in the context of
 * @mddev.  If the array is not yet configured (raid_disks == 0), this
 * also populates mddev's geometry, bitmap location, reshape state and
 * journal/PPL feature flags.  Assigns the device's role (spare /
 * faulty / journal / member) and the corresponding rdev flags.
 *
 * Returns 0 on success (including "accept as spare / hot-add"),
 * -EINVAL if the event count is too old or the feature bits are
 * mutually inconsistent.
 */
static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
{
	struct mdp_superblock_1 *sb = page_address(rdev->sb_page);
	__u64 ev1 = le64_to_cpu(sb->events);

	/* start from a clean slate; state is re-derived below */
	rdev->raid_disk = -1;
	clear_bit(Faulty, &rdev->flags);
	clear_bit(In_sync, &rdev->flags);
	clear_bit(Bitmap_sync, &rdev->flags);
	clear_bit(WriteMostly, &rdev->flags);

	if (mddev->raid_disks == 0) {
		/* Array not configured yet: adopt geometry from this sb */
		mddev->major_version = 1;
		mddev->patch_version = 0;
		mddev->external = 0;
		mddev->chunk_sectors = le32_to_cpu(sb->chunksize);
		mddev->ctime = le64_to_cpu(sb->ctime);
		mddev->utime = le64_to_cpu(sb->utime);
		mddev->level = le32_to_cpu(sb->level);
		mddev->clevel[0] = 0;
		mddev->layout = le32_to_cpu(sb->layout);
		mddev->raid_disks = le32_to_cpu(sb->raid_disks);
		mddev->dev_sectors = le64_to_cpu(sb->size);
		mddev->events = ev1;
		mddev->bitmap_info.offset = 0;
		mddev->bitmap_info.space = 0;
		/* Default location for bitmap is 1K after superblock
		 * using 3K - total of 4K
		 */
		mddev->bitmap_info.default_offset = 1024 >> 9;
		mddev->bitmap_info.default_space = (4096-1024) >> 9;
		mddev->reshape_backwards = 0;

		mddev->recovery_cp = le64_to_cpu(sb->resync_offset);
		memcpy(mddev->uuid, sb->set_uuid, 16);

		mddev->max_disks = (4096-256)/2;

		if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) &&
		    mddev->bitmap_info.file == NULL) {
			/* bitmap_offset is signed: it may lie before the
			 * superblock (1.0) or after it (1.1/1.2) */
			mddev->bitmap_info.offset =
				(__s32)le32_to_cpu(sb->bitmap_offset);
			/* Metadata doesn't record how much space is available.
			 * For 1.0, we assume we can use up to the superblock
			 * if before, else to 4K beyond superblock.
			 * For others, assume no change is possible.
			 */
			if (mddev->minor_version > 0)
				mddev->bitmap_info.space = 0;
			else if (mddev->bitmap_info.offset > 0)
				mddev->bitmap_info.space =
					8 - mddev->bitmap_info.offset;
			else
				mddev->bitmap_info.space =
					-mddev->bitmap_info.offset;
		}

		if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
			mddev->reshape_position = le64_to_cpu(sb->reshape_position);
			mddev->delta_disks = le32_to_cpu(sb->delta_disks);
			mddev->new_level = le32_to_cpu(sb->new_level);
			mddev->new_layout = le32_to_cpu(sb->new_layout);
			mddev->new_chunk_sectors = le32_to_cpu(sb->new_chunk);
			if (mddev->delta_disks < 0 ||
			    (mddev->delta_disks == 0 &&
			     (le32_to_cpu(sb->feature_map)
			      & MD_FEATURE_RESHAPE_BACKWARDS)))
				mddev->reshape_backwards = 1;
		} else {
			mddev->reshape_position = MaxSector;
			mddev->delta_disks = 0;
			mddev->new_level = mddev->level;
			mddev->new_layout = mddev->layout;
			mddev->new_chunk_sectors = mddev->chunk_sectors;
		}

		if (le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)
			set_bit(MD_HAS_JOURNAL, &mddev->flags);

		if (le32_to_cpu(sb->feature_map) &
		    (MD_FEATURE_PPL | MD_FEATURE_MULTIPLE_PPLS)) {
			/* PPL is mutually exclusive with an internal bitmap
			 * or a journal, and the two PPL flavours are
			 * mutually exclusive with each other */
			if (le32_to_cpu(sb->feature_map) &
			    (MD_FEATURE_BITMAP_OFFSET | MD_FEATURE_JOURNAL))
				return -EINVAL;
			if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_PPL) &&
			    (le32_to_cpu(sb->feature_map) &
			     MD_FEATURE_MULTIPLE_PPLS))
				return -EINVAL;
			set_bit(MD_HAS_PPL, &mddev->flags);
		}
	} else if (mddev->pers == NULL) {
		/* Insist of good event counter while assembling, except for
		 * spares (which don't need an event count) */
		++ev1;
		if (rdev->desc_nr >= 0 &&
		    rdev->desc_nr < le32_to_cpu(sb->max_dev) &&
		    (le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < MD_DISK_ROLE_MAX ||
		     le16_to_cpu(sb->dev_roles[rdev->desc_nr]) == MD_DISK_ROLE_JOURNAL))
			if (ev1 < mddev->events)
				return -EINVAL;
	} else if (mddev->bitmap) {
		/* If adding to array with a bitmap, then we can accept an
		 * older device, but not too old.
		 */
		if (ev1 < mddev->bitmap->events_cleared)
			return 0;
		if (ev1 < mddev->events)
			set_bit(Bitmap_sync, &rdev->flags);
	} else {
		if (ev1 < mddev->events)
			/* just a hot-add of a new device, leave raid_disk at -1 */
			return 0;
	}
	if (mddev->level != LEVEL_MULTIPATH) {
		int role;
		/* devices outside the role table are treated as spares */
		if (rdev->desc_nr < 0 ||
		    rdev->desc_nr >= le32_to_cpu(sb->max_dev)) {
			role = MD_DISK_ROLE_SPARE;
			rdev->desc_nr = -1;
		} else
			role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
		switch(role) {
		case MD_DISK_ROLE_SPARE: /* spare */
			break;
		case MD_DISK_ROLE_FAULTY: /* faulty */
			set_bit(Faulty, &rdev->flags);
			break;
		case MD_DISK_ROLE_JOURNAL: /* journal device */
			if (!(le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)) {
				/* journal device without journal feature */
				pr_warn("md: journal device provided without journal feature, ignoring the device\n");
				return -EINVAL;
			}
			set_bit(Journal, &rdev->flags);
			rdev->journal_tail = le64_to_cpu(sb->journal_tail);
			rdev->raid_disk = 0;
			break;
		default:
			/* any other role value is the member slot number */
			rdev->saved_raid_disk = role;
			if ((le32_to_cpu(sb->feature_map) &
			     MD_FEATURE_RECOVERY_OFFSET)) {
				rdev->recovery_offset = le64_to_cpu(sb->recovery_offset);
				if (!(le32_to_cpu(sb->feature_map) &
				      MD_FEATURE_RECOVERY_BITMAP))
					rdev->saved_raid_disk = -1;
			} else {
				/*
				 * If the array is FROZEN, then the device can't
				 * be in_sync with rest of array.
				 */
				if (!test_bit(MD_RECOVERY_FROZEN,
					      &mddev->recovery))
					set_bit(In_sync, &rdev->flags);
			}
			rdev->raid_disk = role;
			break;
		}
		if (sb->devflags & WriteMostly1)
			set_bit(WriteMostly, &rdev->flags);
		if (sb->devflags & FailFast1)
			set_bit(FailFast, &rdev->flags);
		if (le32_to_cpu(sb->feature_map) & MD_FEATURE_REPLACEMENT)
			set_bit(Replacement, &rdev->flags);
	} else /* MULTIPATH are always insync */
		set_bit(In_sync, &rdev->flags);

	return 0;
}
| |
/*
 * sync_super for v1.x metadata.
 *
 * Rebuild rdev's in-memory v1 superblock from current mddev and rdev
 * state so it can be written to disk: array geometry, feature map,
 * per-device recovery/journal/PPL state, the bad-block log, the role
 * table, and finally the checksum.
 */
static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev)
{
	struct mdp_superblock_1 *sb;
	struct md_rdev *rdev2;
	int max_dev, i;
	/* make rdev->sb match mddev and rdev data. */

	sb = page_address(rdev->sb_page);

	/* feature_map is rebuilt from scratch below */
	sb->feature_map = 0;
	sb->pad0 = 0;
	sb->recovery_offset = cpu_to_le64(0);
	memset(sb->pad3, 0, sizeof(sb->pad3));

	sb->utime = cpu_to_le64((__u64)mddev->utime);
	sb->events = cpu_to_le64(mddev->events);
	if (mddev->in_sync)
		sb->resync_offset = cpu_to_le64(mddev->recovery_cp);
	else if (test_bit(MD_JOURNAL_CLEAN, &mddev->flags))
		sb->resync_offset = cpu_to_le64(MaxSector);
	else
		sb->resync_offset = cpu_to_le64(0);

	sb->cnt_corrected_read = cpu_to_le32(atomic_read(&rdev->corrected_errors));

	sb->raid_disks = cpu_to_le32(mddev->raid_disks);
	sb->size = cpu_to_le64(mddev->dev_sectors);
	sb->chunksize = cpu_to_le32(mddev->chunk_sectors);
	sb->level = cpu_to_le32(mddev->level);
	sb->layout = cpu_to_le32(mddev->layout);
	if (test_bit(FailFast, &rdev->flags))
		sb->devflags |= FailFast1;
	else
		sb->devflags &= ~FailFast1;

	if (test_bit(WriteMostly, &rdev->flags))
		sb->devflags |= WriteMostly1;
	else
		sb->devflags &= ~WriteMostly1;
	sb->data_offset = cpu_to_le64(rdev->data_offset);
	sb->data_size = cpu_to_le64(rdev->sectors);

	if (mddev->bitmap && mddev->bitmap_info.file == NULL) {
		sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_info.offset);
		sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET);
	}

	/* a member that is not fully in sync records its recovery point */
	if (rdev->raid_disk >= 0 && !test_bit(Journal, &rdev->flags) &&
	    !test_bit(In_sync, &rdev->flags)) {
		sb->feature_map |=
			cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET);
		sb->recovery_offset =
			cpu_to_le64(rdev->recovery_offset);
		if (rdev->saved_raid_disk >= 0 && mddev->bitmap)
			sb->feature_map |=
				cpu_to_le32(MD_FEATURE_RECOVERY_BITMAP);
	}
	/* Note: recovery_offset and journal_tail share space */
	if (test_bit(Journal, &rdev->flags))
		sb->journal_tail = cpu_to_le64(rdev->journal_tail);
	if (test_bit(Replacement, &rdev->flags))
		sb->feature_map |=
			cpu_to_le32(MD_FEATURE_REPLACEMENT);

	if (mddev->reshape_position != MaxSector) {
		sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE);
		sb->reshape_position = cpu_to_le64(mddev->reshape_position);
		sb->new_layout = cpu_to_le32(mddev->new_layout);
		sb->delta_disks = cpu_to_le32(mddev->delta_disks);
		sb->new_level = cpu_to_le32(mddev->new_level);
		sb->new_chunk = cpu_to_le32(mddev->new_chunk_sectors);
		if (mddev->delta_disks == 0 &&
		    mddev->reshape_backwards)
			sb->feature_map
				|= cpu_to_le32(MD_FEATURE_RESHAPE_BACKWARDS);
		if (rdev->new_data_offset != rdev->data_offset) {
			sb->feature_map
				|= cpu_to_le32(MD_FEATURE_NEW_OFFSET);
			sb->new_offset = cpu_to_le32((__u32)(rdev->new_data_offset
							     - rdev->data_offset));
		}
	}

	if (mddev_is_clustered(mddev))
		sb->feature_map |= cpu_to_le32(MD_FEATURE_CLUSTERED);

	if (rdev->badblocks.count == 0)
		/* Nothing to do for bad blocks*/ ;
	else if (sb->bblog_offset == 0)
		/* Cannot record bad blocks on this device */
		md_error(mddev, rdev);
	else {
		struct badblocks *bb = &rdev->badblocks;
		u64 *bbp = (u64 *)page_address(rdev->bb_page);
		u64 *p = bb->page;
		sb->feature_map |= cpu_to_le32(MD_FEATURE_BAD_BLOCKS);
		if (bb->changed) {
			unsigned seq;

			/* snapshot the bad-block table under the seqlock;
			 * retry if it changed while we were copying */
retry:
			seq = read_seqbegin(&bb->lock);

			/* 0xff fill: all-ones entries mark the end of the
			 * on-disk list (see super_1_load) */
			memset(bbp, 0xff, PAGE_SIZE);

			for (i = 0 ; i < bb->count ; i++) {
				u64 internal_bb = p[i];
				u64 store_bb = ((BB_OFFSET(internal_bb) << 10)
						| BB_LEN(internal_bb));
				bbp[i] = cpu_to_le64(store_bb);
			}
			bb->changed = 0;
			if (read_seqretry(&bb->lock, seq))
				goto retry;

			bb->sector = (rdev->sb_start +
				      (int)le32_to_cpu(sb->bblog_offset));
			bb->size = le16_to_cpu(sb->bblog_size);
		}
	}

	/* the role table must cover the highest desc_nr in use; it may
	 * grow but never shrinks */
	max_dev = 0;
	rdev_for_each(rdev2, mddev)
		if (rdev2->desc_nr+1 > max_dev)
			max_dev = rdev2->desc_nr+1;

	if (max_dev > le32_to_cpu(sb->max_dev)) {
		int bmask;
		sb->max_dev = cpu_to_le32(max_dev);
		rdev->sb_size = max_dev * 2 + 256;
		bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1;
		if (rdev->sb_size & bmask)
			rdev->sb_size = (rdev->sb_size | bmask) + 1;
	} else
		max_dev = le32_to_cpu(sb->max_dev);

	for (i=0; i<max_dev;i++)
		sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_SPARE);

	if (test_bit(MD_HAS_JOURNAL, &mddev->flags))
		sb->feature_map |= cpu_to_le32(MD_FEATURE_JOURNAL);

	if (test_bit(MD_HAS_PPL, &mddev->flags)) {
		if (test_bit(MD_HAS_MULTIPLE_PPLS, &mddev->flags))
			sb->feature_map |=
				cpu_to_le32(MD_FEATURE_MULTIPLE_PPLS);
		else
			sb->feature_map |= cpu_to_le32(MD_FEATURE_PPL);
		sb->ppl.offset = cpu_to_le16(rdev->ppl.offset);
		sb->ppl.size = cpu_to_le16(rdev->ppl.size);
	}

	/* fill in the role of every current member */
	rdev_for_each(rdev2, mddev) {
		i = rdev2->desc_nr;
		if (test_bit(Faulty, &rdev2->flags))
			sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_FAULTY);
		else if (test_bit(In_sync, &rdev2->flags))
			sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
		else if (test_bit(Journal, &rdev2->flags))
			sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_JOURNAL);
		else if (rdev2->raid_disk >= 0)
			sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
		else
			sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_SPARE);
	}

	/* checksum must be computed last, over the finished superblock */
	sb->sb_csum = calc_sb_1_csum(sb);
}
| |
/*
 * rdev_size_change for v1.x metadata.
 *
 * Resize @rdev to @num_sectors (0 means "as large as possible"),
 * update the superblock's data_size/super_offset and write it out.
 * For minor version 0 (superblock at the end) the superblock is also
 * relocated.  Returns the new size in sectors, or 0 if the change is
 * not possible (too small for the array, data offset in flux, or an
 * immovable internal bitmap).
 */
static unsigned long long
super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors)
{
	struct mdp_superblock_1 *sb;
	sector_t max_sectors;
	if (num_sectors && num_sectors < rdev->mddev->dev_sectors)
		return 0; /* component must fit device */
	if (rdev->data_offset != rdev->new_data_offset)
		return 0; /* too confusing */
	if (rdev->sb_start < rdev->data_offset) {
		/* minor versions 1 and 2; superblock before data */
		max_sectors = i_size_read(rdev->bdev->bd_inode) >> 9;
		max_sectors -= rdev->data_offset;
		if (!num_sectors || num_sectors > max_sectors)
			num_sectors = max_sectors;
	} else if (rdev->mddev->bitmap_info.offset) {
		/* minor version 0 with bitmap we can't move */
		return 0;
	} else {
		/* minor version 0; superblock after data */
		sector_t sb_start;
		/* place the superblock 8K-12K from the end, 4K aligned */
		sb_start = (i_size_read(rdev->bdev->bd_inode) >> 9) - 8*2;
		sb_start &= ~(sector_t)(4*2 - 1);
		max_sectors = rdev->sectors + sb_start - rdev->sb_start;
		if (!num_sectors || num_sectors > max_sectors)
			num_sectors = max_sectors;
		rdev->sb_start = sb_start;
	}
	sb = page_address(rdev->sb_page);
	sb->data_size = cpu_to_le64(num_sectors);
	sb->super_offset = cpu_to_le64(rdev->sb_start);
	sb->sb_csum = calc_sb_1_csum(sb);
	/* write the updated superblock, retrying until the write
	 * completes without error */
	do {
		md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
			       rdev->sb_page);
	} while (md_super_wait(rdev->mddev) < 0);
	return num_sectors;

}
| |
| static int |
| super_1_allow_new_offset(struct md_rdev *rdev, |
| unsigned long long new_offset) |
| { |
| /* All necessary checks on new >= old have been done */ |
| struct bitmap *bitmap; |
| if (new_offset >= rdev->data_offset) |
| return 1; |
| |
| /* with 1.0 metadata, there is no metadata to tread on |
| * so we can always move back */ |
| if (rdev->mddev->minor_version == 0) |
| return 1; |
| |
| /* otherwise we must be sure not to step on |
| * any metadata, so stay: |
| * 36K beyond start of superblock |
| * beyond end of badblocks |
| * beyond write-intent bitmap |
| */ |
| if (rdev->sb_start + (32+4)*2 > new_offset) |
| return 0; |
| bitmap = rdev->mddev->bitmap; |
| if (bitmap && !rdev->mddev->bitmap_info.file && |
| rdev->sb_start + rdev->mddev->bitmap_info.offset + |
| bitmap->storage.file_pages * (PAGE_SIZE>>9) > new_offset) |
| return 0; |
| if (rdev->badblocks.sector + rdev->badblocks.size > new_offset) |
| return 0; |
| |
| return 1; |
| } |
| |
/*
 * Dispatch table of supported on-disk metadata formats, indexed by
 * mddev->major_version (0 = legacy 0.90, 1 = v1.x).  sync_super() and
 * other callers select the handler set through this table.
 */
static struct super_type super_types[] = {
	[0] = {
		.name	= "0.90.0",
		.owner	= THIS_MODULE,
		.load_super	    = super_90_load,
		.validate_super	    = super_90_validate,
		.sync_super	    = super_90_sync,
		.rdev_size_change   = super_90_rdev_size_change,
		.allow_new_offset   = super_90_allow_new_offset,
	},
	[1] = {
		.name	= "md-1",
		.owner	= THIS_MODULE,
		.load_super	    = super_1_load,
		.validate_super	    = super_1_validate,
		.sync_super	    = super_1_sync,
		.rdev_size_change   = super_1_rdev_size_change,
		.allow_new_offset   = super_1_allow_new_offset,
	},
};
| |
| static void sync_super(struct mddev *mddev, struct md_rdev *rdev) |
| { |
| if (mddev->sync_super) { |
| mddev->sync_super(mddev, rdev); |
| return; |
| } |
| |
| BUG_ON(mddev->major_version >= ARRAY_SIZE(super_types)); |
| |
| super_types[mddev->major_version].sync_super(mddev, rdev); |
| } |
| |
| static int match_mddev_units(struct mddev *mddev1, struct mddev *mddev2) |
| { |
| struct md_rdev *rdev, *rdev2; |
| |
| rcu_read_lock(); |
| rdev_for_each_rcu(rdev, mddev1) { |
| if (test_bit(Faulty, &rdev->flags) || |
| test_bit(Journal, &rdev->flags) || |
| rdev->raid_disk == -1) |
| continue; |
| rdev_for_each_rcu(rdev2, mddev2) { |
| if (test_bit(Faulty, &rdev2->flags) || |
| test_bit(Journal, &rdev2->flags) || |
| rdev2->raid_disk == -1) |
| continue; |
| if (rdev->bdev->bd_contains == |
| rdev2->bdev->bd_contains) { |
| rcu_read_unlock(); |
| return 1; |
| } |
| } |
| } |
| rcu_read_unlock(); |
| return 0; |
| } |
| |
/* rdevs detected but not yet bound to an array — NOTE(review): consumed
 * by autorun/autodetect code elsewhere in this file; confirm. */
static LIST_HEAD(pending_raid_disks);
| |
| /* |
| * Try to register data integrity profile for an mddev |
| * |
| * This is called when an array is started and after a disk has been kicked |
| * from the array. It only succeeds if all working and active component devices |
| * are integrity capable with matching profiles. |
| */ |
| int md_integrity_register(struct mddev *mddev) |
| { |
| struct md_rdev *rdev, *reference = NULL; |
| |
| if (list_empty(&mddev->disks)) |
| return 0; /* nothing to do */ |
| if (!mddev->gendisk || blk_get_integrity(mddev->gendisk)) |
| return 0; /* shouldn't register, or already is */ |
| rdev_for_each(rdev, mddev) { |
| /* skip spares and non-functional disks */ |
| if (test_bit(Faulty, &rdev->flags)) |
| continue; |
| if (rdev->raid_disk < 0) |
| continue; |
| if (!reference) { |
| /* Use the first rdev as the reference */ |
| reference = rdev; |
| continue; |
| } |
| /* does this rdev's profile match the reference profile? */ |
| if (blk_integrity_compare(reference->bdev->bd_disk, |
| rdev->bdev->bd_disk) < 0) |
| return -EINVAL; |
| } |
| if (!reference || !bdev_get_integrity(reference->bdev)) |
| return 0; |
| /* |
| * All component devices are integrity capable and have matching |
| * profiles, register the common profile for the md device. |
| */ |
| blk_integrity_register(mddev->gendisk, |
| bdev_get_integrity(reference->bdev)); |
| |
| pr_debug("md: data integrity enabled on %s\n", mdname(mddev)); |
| if (bioset_integrity_create(&mddev->bio_set, BIO_POOL_SIZE)) { |
| pr_err("md: failed to create integrity pool for %s\n", |
| mdname(mddev)); |
| return -EINVAL; |
| } |
| return 0; |
| } |
| EXPORT_SYMBOL(md_integrity_register); |
| |
| /* |
| * Attempt to add an rdev, but only if it is consistent with the current |
| * integrity profile |
| */ |
| int md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev) |
| { |
| struct blk_integrity *bi_rdev; |
| struct blk_integrity *bi_mddev; |
| char name[BDEVNAME_SIZE]; |
| |
| if (!mddev->gendisk) |
| return 0; |
| |
| bi_rdev = bdev_get_integrity(rdev->bdev); |
| bi_mddev = blk_get_integrity(mddev->gendisk); |
| |
| if (!bi_mddev) /* nothing to do */ |
| return 0; |
| |
| if (blk_integrity_compare(mddev->gendisk, rdev->bdev->bd_disk) != 0) { |
| pr_err("%s: incompatible integrity profile for %s\n", |
| mdname(mddev), bdevname(rdev->bdev, name)); |
| return -ENXIO; |
| } |
| |
| return 0; |
| } |
| EXPORT_SYMBOL(md_integrity_add_rdev); |
| |
/*
 * bind_rdev_to_array - attach a component device to an array.
 *
 * Rejects duplicates, read-only devices on a running array, devices too
 * small for the array, and desc_nr collisions; then registers the rdev's
 * kobject under the array's sysfs directory and links it onto
 * mddev->disks.  Returns 0 on success or a negative errno.
 */
static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev)
{
	char b[BDEVNAME_SIZE];
	struct kobject *ko;
	int err;

	/* prevent duplicates */
	if (find_rdev(mddev, rdev->bdev->bd_dev))
		return -EEXIST;

	/* a read-only device cannot join an already-running (writable) array */
	if ((bdev_read_only(rdev->bdev) || bdev_read_only(rdev->meta_bdev)) &&
	    mddev->pers)
		return -EROFS;

	/* make sure rdev->sectors exceeds mddev->dev_sectors */
	if (!test_bit(Journal, &rdev->flags) &&
	    rdev->sectors &&
	    (mddev->dev_sectors == 0 || rdev->sectors < mddev->dev_sectors)) {
		if (mddev->pers) {
			/* Cannot change size, so fail
			 * If mddev->level <= 0, then we don't care
			 * about aligning sizes (e.g. linear)
			 */
			if (mddev->level > 0)
				return -ENOSPC;
		} else
			/* array not started yet: shrink it to this device */
			mddev->dev_sectors = rdev->sectors;
	}

	/* Verify rdev->desc_nr is unique.
	 * If it is -1, assign a free number, else
	 * check number is not in use
	 */
	rcu_read_lock();
	if (rdev->desc_nr < 0) {
		int choice = 0;
		/* on a running array, start searching above the member slots */
		if (mddev->pers)
			choice = mddev->raid_disks;
		while (md_find_rdev_nr_rcu(mddev, choice))
			choice++;
		rdev->desc_nr = choice;
	} else {
		if (md_find_rdev_nr_rcu(mddev, rdev->desc_nr)) {
			rcu_read_unlock();
			return -EBUSY;
		}
	}
	rcu_read_unlock();
	if (!test_bit(Journal, &rdev->flags) &&
	    mddev->max_disks && rdev->desc_nr >= mddev->max_disks) {
		pr_warn("md: %s: array is limited to %d devices\n",
			mdname(mddev), mddev->max_disks);
		return -EBUSY;
	}
	bdevname(rdev->bdev,b);
	/* '/' is not allowed in a kobject name (used below as "dev-%s") */
	strreplace(b, '/', '!');

	rdev->mddev = mddev;
	pr_debug("md: bind<%s>\n", b);

	if ((err = kobject_add(&rdev->kobj, &mddev->kobj, "dev-%s", b)))
		goto fail;

	ko = &part_to_dev(rdev->bdev->bd_part)->kobj;
	if (sysfs_create_link(&rdev->kobj, ko, "block"))
		/* failure here is OK */;
	rdev->sysfs_state = sysfs_get_dirent_safe(rdev->kobj.sd, "state");

	/* publish on the array's device list (readers iterate under RCU) */
	list_add_rcu(&rdev->same_set, &mddev->disks);
	bd_link_disk_holder(rdev->bdev, mddev->gendisk);

	/* May as well allow recovery to be retried once */
	mddev->recovery_disabled++;

	return 0;

 fail:
	pr_warn("md: failed to register dev-%s for %s\n",
		b, mdname(mddev));
	return err;
}
| |
| static void md_delayed_delete(struct work_struct *ws) |
| { |
| struct md_rdev *rdev = container_of(ws, struct md_rdev, del_work); |
| kobject_del(&rdev->kobj); |
| kobject_put(&rdev->kobj); |
| } |
| |
/*
 * Detach @rdev from its array: unlink the holder relationship, remove it
 * from the (RCU-protected) disk list, drop its sysfs state handle and
 * schedule the kobject teardown on md_misc_wq.
 */
static void unbind_rdev_from_array(struct md_rdev *rdev)
{
	char b[BDEVNAME_SIZE];

	bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk);
	list_del_rcu(&rdev->same_set);
	pr_debug("md: unbind<%s>\n", bdevname(rdev->bdev,b));
	rdev->mddev = NULL;
	sysfs_remove_link(&rdev->kobj, "block");
	sysfs_put(rdev->sysfs_state);
	rdev->sysfs_state = NULL;
	rdev->badblocks.count = 0;
	/* We need to delay this, otherwise we can deadlock when
	 * writing to 'remove' to "dev/state". We also need
	 * to delay it due to rcu usage.
	 */
	synchronize_rcu();	/* wait for RCU readers of the disk list */
	INIT_WORK(&rdev->del_work, md_delayed_delete);
	kobject_get(&rdev->kobj);	/* released by md_delayed_delete() */
	queue_work(md_misc_wq, &rdev->del_work);
}
| |
| /* |
| * prevent the device from being mounted, repartitioned or |
| * otherwise reused by a RAID array (or any other kernel |
| * subsystem), by bd_claiming the device. |
| */ |
| static int lock_rdev(struct md_rdev *rdev, dev_t dev, int shared) |
| { |
| int err = 0; |
| struct block_device *bdev; |
| char b[BDEVNAME_SIZE]; |
| |
| bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, |
| shared ? (struct md_rdev *)lock_rdev : rdev); |
| if (IS_ERR(bdev)) { |
| pr_warn("md: could not open %s.\n", __bdevname(dev, b)); |
| return PTR_ERR(bdev); |
| } |
| rdev->bdev = bdev; |
| return err; |
| } |
| |
/* Release the exclusive claim taken in lock_rdev(). */
static void unlock_rdev(struct md_rdev *rdev)
{
	struct block_device *bdev = rdev->bdev;
	/* clear the pointer before dropping the device reference */
	rdev->bdev = NULL;
	blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
}
| |
| void md_autodetect_dev(dev_t dev); |
| |
/*
 * Final release of an rdev that is no longer bound to any array:
 * free its pages/badblocks state, hand the device back to autodetect
 * (built-in kernels only), release the block-device claim and drop
 * the kobject reference.
 */
static void export_rdev(struct md_rdev *rdev)
{
	char b[BDEVNAME_SIZE];

	pr_debug("md: export_rdev(%s)\n", bdevname(rdev->bdev,b));
	md_rdev_clear(rdev);
#ifndef MODULE
	/* boot-time autodetected devices go back on the autodetect list */
	if (test_bit(AutoDetected, &rdev->flags))
		md_autodetect_dev(rdev->bdev->bd_dev);
#endif
	unlock_rdev(rdev);
	kobject_put(&rdev->kobj);
}
| |
/* Remove @rdev from its array and release it entirely
 * (unbind + export). */
void md_kick_rdev_from_array(struct md_rdev *rdev)
{
	unbind_rdev_from_array(rdev);
	export_rdev(rdev);
}
EXPORT_SYMBOL_GPL(md_kick_rdev_from_array);
| |
| static void export_array(struct mddev *mddev) |
| { |
| struct md_rdev *rdev; |
| |
| while (!list_empty(&mddev->disks)) { |
| rdev = list_first_entry(&mddev->disks, struct md_rdev, |
| same_set); |
| md_kick_rdev_from_array(rdev); |
| } |
| mddev->raid_disks = 0; |
| mddev->major_version = 0; |
| } |
| |
/*
 * Try to mark the array clean (->in_sync = 1).  Called with mddev->lock
 * held; temporarily drops it to switch writes_pending to atomic mode so
 * the pending-write count can be read reliably.  Returns the resulting
 * ->in_sync value.
 */
static bool set_in_sync(struct mddev *mddev)
{
	lockdep_assert_held(&mddev->lock);
	if (!mddev->in_sync) {
		/* sync_checkers keeps writes_pending in atomic mode while
		 * any checker is still looking at it */
		mddev->sync_checkers++;
		spin_unlock(&mddev->lock);
		percpu_ref_switch_to_atomic_sync(&mddev->writes_pending);
		spin_lock(&mddev->lock);
		/* re-check under the lock: another thread may have raced */
		if (!mddev->in_sync &&
		    percpu_ref_is_zero(&mddev->writes_pending)) {
			mddev->in_sync = 1;
			/*
			 * Ensure ->in_sync is visible before we clear
			 * ->sync_checkers.
			 */
			smp_mb();
			set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
			sysfs_notify_dirent_safe(mddev->sysfs_state);
		}
		if (--mddev->sync_checkers == 0)
			percpu_ref_switch_to_percpu(&mddev->writes_pending);
	}
	if (mddev->safemode == 1)
		mddev->safemode = 0;
	return mddev->in_sync;
}
| |
| static void sync_sbs(struct mddev *mddev, int nospares) |
| { |
| /* Update each superblock (in-memory image), but |
| * if we are allowed to, skip spares which already |
| * have the right event counter, or have one earlier |
| * (which would mean they aren't being marked as dirty |
| * with the rest of the array) |
| */ |
| struct md_rdev *rdev; |
| rdev_for_each(rdev, mddev) { |
| if (rdev->sb_events == mddev->events || |
| (nospares && |
| rdev->raid_disk < 0 && |
| rdev->sb_events+1 == mddev->events)) { |
| /* Don't update this superblock */ |
| rdev->sb_loaded = 2; |
| } else { |
| sync_super(mddev, rdev); |
| rdev->sb_loaded = 1; |
| } |
| } |
| } |
| |
| static bool does_sb_need_changing(struct mddev *mddev) |
| { |
| struct md_rdev *rdev; |
| struct mdp_superblock_1 *sb; |
| int role; |
| |
| /* Find a good rdev */ |
| rdev_for_each(rdev, mddev) |
| if ((rdev->raid_disk >= 0) && !test_bit(Faulty, &rdev->flags)) |
| break; |
| |
| /* No good device found. */ |
| if (!rdev) |
| return false; |
| |
| sb = page_address(rdev->sb_page); |
| /* Check if a device has become faulty or a spare become active */ |
| rdev_for_each(rdev, mddev) { |
| role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]); |
| /* Device activated? */ |
| if (role == 0xffff && rdev->raid_disk >=0 && |
| !test_bit(Faulty, &rdev->flags)) |
| return true; |
| /* Device turned faulty? */ |
| if (test_bit(Faulty, &rdev->flags) && (role < 0xfffd)) |
| return true; |
| } |
| |
| /* Check if any mddev parameters have changed */ |
| if ((mddev->dev_sectors != le64_to_cpu(sb->size)) || |
| (mddev->reshape_position != le64_to_cpu(sb->reshape_position)) || |
| (mddev->layout != le32_to_cpu(sb->layout)) || |
| (mddev->raid_disks != le32_to_cpu(sb->raid_disks)) || |
| (mddev->chunk_sectors != le32_to_cpu(sb->chunksize))) |
| return true; |
| |
| return false; |
| } |
| |
/*
 * md_update_sb - write the in-memory superblocks to all member devices.
 * @mddev: the array
 * @force_change: spares must be updated too (equivalent to
 *	MD_SB_CHANGE_DEVS being set)
 *
 * Increments (or, for a pure clean<->dirty flip, possibly decrements)
 * the event counter, syncs each rdev's superblock image and writes it
 * out.  Loops back to 'repeat' if the state changed while writing, and
 * to 'rewrite' if a superblock write failed.
 */
void md_update_sb(struct mddev *mddev, int force_change)
{
	struct md_rdev *rdev;
	int sync_req;
	int nospares = 0;
	int any_badblocks_changed = 0;
	int ret = -1;

	/* read-only arrays cannot write metadata; just record the request */
	if (mddev->ro) {
		if (force_change)
			set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
		return;
	}

repeat:
	if (mddev_is_clustered(mddev)) {
		if (test_and_clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags))
			force_change = 1;
		if (test_and_clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags))
			nospares = 1;
		ret = md_cluster_ops->metadata_update_start(mddev);
		/* Has someone else already updated the sb? */
		if (!does_sb_need_changing(mddev)) {
			if (ret == 0)
				md_cluster_ops->metadata_update_cancel(mddev);
			bit_clear_unless(&mddev->sb_flags, BIT(MD_SB_CHANGE_PENDING),
							 BIT(MD_SB_CHANGE_DEVS) |
							 BIT(MD_SB_CHANGE_CLEAN));
			return;
		}
	}

	/*
	 * First make sure individual recovery_offsets are correct
	 * curr_resync_completed can only be used during recovery.
	 * During reshape/resync it might use array-addresses rather
	 * that device addresses.
	 */
	rdev_for_each(rdev, mddev) {
		if (rdev->raid_disk >= 0 &&
		    mddev->delta_disks >= 0 &&
		    test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
		    test_bit(MD_RECOVERY_RECOVER, &mddev->recovery) &&
		    !test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
		    !test_bit(Journal, &rdev->flags) &&
		    !test_bit(In_sync, &rdev->flags) &&
		    mddev->curr_resync_completed > rdev->recovery_offset)
				rdev->recovery_offset = mddev->curr_resync_completed;

	}
	if (!mddev->persistent) {
		/* no on-disk metadata: just clear the change bits and
		 * acknowledge badblocks in memory */
		clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
		clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
		if (!mddev->external) {
			clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
			rdev_for_each(rdev, mddev) {
				if (rdev->badblocks.changed) {
					rdev->badblocks.changed = 0;
					ack_all_badblocks(&rdev->badblocks);
					md_error(mddev, rdev);
				}
				clear_bit(Blocked, &rdev->flags);
				clear_bit(BlockedBadBlocks, &rdev->flags);
				wake_up(&rdev->blocked_wait);
			}
		}
		wake_up(&mddev->sb_wait);
		return;
	}

	spin_lock(&mddev->lock);

	mddev->utime = ktime_get_real_seconds();

	if (test_and_clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags))
		force_change = 1;
	if (test_and_clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags))
		/* just a clean<-> dirty transition, possibly leave spares alone,
		 * though if events isn't the right even/odd, we will have to do
		 * spares after all
		 */
		nospares = 1;
	if (force_change)
		nospares = 0;
	if (mddev->degraded)
		/* If the array is degraded, then skipping spares is both
		 * dangerous and fairly pointless.
		 * Dangerous because a device that was removed from the array
		 * might have a event_count that still looks up-to-date,
		 * so it can be re-added without a resync.
		 * Pointless because if there are any spares to skip,
		 * then a recovery will happen and soon that array won't
		 * be degraded any more and the spare can go back to sleep then.
		 */
		nospares = 0;

	sync_req = mddev->in_sync;

	/* If this is just a dirty<->clean transition, and the array is clean
	 * and 'events' is odd, we can roll back to the previous clean state */
	if (nospares
	    && (mddev->in_sync && mddev->recovery_cp == MaxSector)
	    && mddev->can_decrease_events
	    && mddev->events != 1) {
		mddev->events--;
		mddev->can_decrease_events = 0;
	} else {
		/* otherwise we have to go forward and ... */
		mddev->events ++;
		mddev->can_decrease_events = nospares;
	}

	/*
	 * This 64-bit counter should never wrap.
	 * Either we are in around ~1 trillion A.C., assuming
	 * 1 reboot per second, or we have a bug...
	 */
	WARN_ON(mddev->events == 0);

	rdev_for_each(rdev, mddev) {
		if (rdev->badblocks.changed)
			any_badblocks_changed++;
		if (test_bit(Faulty, &rdev->flags))
			set_bit(FaultRecorded, &rdev->flags);
	}

	sync_sbs(mddev, nospares);
	spin_unlock(&mddev->lock);

	pr_debug("md: updating %s RAID superblock on device (in sync %d)\n",
		 mdname(mddev), mddev->in_sync);

	if (mddev->queue)
		blk_add_trace_msg(mddev->queue, "md md_update_sb");
rewrite:
	md_bitmap_update_sb(mddev->bitmap);
	rdev_for_each(rdev, mddev) {
		char b[BDEVNAME_SIZE];

		if (rdev->sb_loaded != 1)
			continue; /* no noise on spare devices */

		if (!test_bit(Faulty, &rdev->flags)) {
			md_super_write(mddev,rdev,
				       rdev->sb_start, rdev->sb_size,
				       rdev->sb_page);
			pr_debug("md: (write) %s's sb offset: %llu\n",
				 bdevname(rdev->bdev, b),
				 (unsigned long long)rdev->sb_start);
			rdev->sb_events = mddev->events;
			/* flush any recorded bad blocks alongside the sb */
			if (rdev->badblocks.size) {
				md_super_write(mddev, rdev,
					       rdev->badblocks.sector,
					       rdev->badblocks.size << 9,
					       rdev->bb_page);
				rdev->badblocks.size = 0;
			}

		} else
			pr_debug("md: %s (skipping faulty)\n",
				 bdevname(rdev->bdev, b));

		if (mddev->level == LEVEL_MULTIPATH)
			/* only need to write one superblock... */
			break;
	}
	if (md_super_wait(mddev) < 0)
		goto rewrite;
	/* if there was a failure, MD_SB_CHANGE_DEVS was set, and we re-write super */

	if (mddev_is_clustered(mddev) && ret == 0)
		md_cluster_ops->metadata_update_finish(mddev);

	if (mddev->in_sync != sync_req ||
	    !bit_clear_unless(&mddev->sb_flags, BIT(MD_SB_CHANGE_PENDING),
			       BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_CLEAN)))
		/* have to write it out again */
		goto repeat;
	wake_up(&mddev->sb_wait);
	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
		sysfs_notify(&mddev->kobj, NULL, "sync_completed");

	rdev_for_each(rdev, mddev) {
		if (test_and_clear_bit(FaultRecorded, &rdev->flags))
			clear_bit(Blocked, &rdev->flags);

		if (any_badblocks_changed)
			ack_all_badblocks(&rdev->badblocks);
		clear_bit(BlockedBadBlocks, &rdev->flags);
		wake_up(&rdev->blocked_wait);
	}
}
EXPORT_SYMBOL(md_update_sb);
| |
/*
 * Make an already-bound rdev an active part of its (running) array:
 * validate its superblock against the array, hot-add it to the
 * personality, then kick off any needed recovery.  Returns 0 or a
 * negative errno; on hot-add failure the rdev is kicked back out.
 */
static int add_bound_rdev(struct md_rdev *rdev)
{
	struct mddev *mddev = rdev->mddev;
	int err = 0;
	bool add_journal = test_bit(Journal, &rdev->flags);

	if (!mddev->pers->hot_remove_disk || add_journal) {
		/* If there is hot_add_disk but no hot_remove_disk
		 * then added disks for geometry changes,
		 * and should be added immediately.
		 */
		super_types[mddev->major_version].
			validate_super(mddev, rdev);
		/* journal devices are added with the array quiesced */
		if (add_journal)
			mddev_suspend(mddev);
		err = mddev->pers->hot_add_disk(mddev, rdev);
		if (add_journal)
			mddev_resume(mddev);
		if (err) {
			md_kick_rdev_from_array(rdev);
			return err;
		}
	}
	sysfs_notify_dirent_safe(rdev->sysfs_state);

	set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
	if (mddev->degraded)
		set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
	md_new_event(mddev);
	md_wakeup_thread(mddev->thread);
	return 0;
}
| |
/* words written to sysfs files may, or may not, be \n terminated.
 * We want to accept either form. For this we use cmd_match.
 */
/* Return 1 if @cmd equals @str, allowing @cmd one optional trailing
 * newline (as written by sysfs); 0 otherwise. */
static int cmd_match(const char *cmd, const char *str)
{
	/* advance both pointers over the common prefix */
	for (; *cmd && *str && *cmd == *str; cmd++, str++)
		;
	if (*cmd == '\n')
		cmd++;
	/* a match requires both strings to be fully consumed */
	return *cmd == '\0' && *str == '\0';
}
| |
/*
 * Per-rdev sysfs attribute: an attribute plus show/store callbacks
 * that operate directly on the md_rdev.
 */
struct rdev_sysfs_entry {
	struct attribute attr;
	ssize_t (*show)(struct md_rdev *, char *);
	ssize_t (*store)(struct md_rdev *, const char *, size_t);
};
| |
/*
 * Show handler for the per-rdev "state" sysfs file: emits a
 * comma-separated list of flag names (faulty, in_sync, journal, ...),
 * then trims the trailing separator and appends a newline.
 */
static ssize_t
state_show(struct md_rdev *rdev, char *page)
{
	char *sep = ",";
	size_t len = 0;
	/* snapshot the flags once so the output is internally consistent */
	unsigned long flags = READ_ONCE(rdev->flags);

	/* unacknowledged bad blocks count as "faulty" unless the metadata
	 * handler manages badblocks externally (ExternalBbl) */
	if (test_bit(Faulty, &flags) ||
	    (!test_bit(ExternalBbl, &flags) &&
	    rdev->badblocks.unacked_exist))
		len += sprintf(page+len, "faulty%s", sep);
	if (test_bit(In_sync, &flags))
		len += sprintf(page+len, "in_sync%s", sep);
	if (test_bit(Journal, &flags))
		len += sprintf(page+len, "journal%s", sep);
	if (test_bit(WriteMostly, &flags))
		len += sprintf(page+len, "write_mostly%s", sep);
	if (test_bit(Blocked, &flags) ||
	    (rdev->badblocks.unacked_exist
	     && !test_bit(Faulty, &flags)))
		len += sprintf(page+len, "blocked%s", sep);
	/* neither faulty, journal nor in_sync: the device is a spare */
	if (!test_bit(Faulty, &flags) &&
	    !test_bit(Journal, &flags) &&
	    !test_bit(In_sync, &flags))
		len += sprintf(page+len, "spare%s", sep);
	if (test_bit(WriteErrorSeen, &flags))
		len += sprintf(page+len, "write_error%s", sep);
	if (test_bit(WantReplacement, &flags))
		len += sprintf(page+len, "want_replacement%s", sep);
	if (test_bit(Replacement, &flags))
		len += sprintf(page+len, "replacement%s", sep);
	if (test_bit(ExternalBbl, &flags))
		len += sprintf(page+len, "external_bbl%s", sep);
	if (test_bit(FailFast, &flags))
		len += sprintf(page+len, "failfast%s", sep);

	/* drop the trailing separator, if anything was printed */
	if (len)
		len -= strlen(sep);

	return len+sprintf(page+len, "\n");
}
| |
| static ssize_t |
| state_store(struct md_rdev *rdev, const char *buf, size_t len) |
| { |
| /* can write |
| * faulty - simulates an error |
| * remove - disconnects the device |
| * writemostly - sets write_mostly |
| * -writemostly - clears write_mostly |
| * blocked - sets the Blocked flags |
| * -blocked - clears the Blocked and possibly simulates an error |
| * insync - sets Insync providing device isn't active |
| * -insync - clear Insync for a device with a slot assigned, |
| * so that it gets rebuilt based on bitmap |
| * write_error - sets WriteErrorSeen |
| * -write_error - clears WriteErrorSeen |
| * {,-}failfast - set/clear FailFast |
| */ |
| int err = -EINVAL; |
| if (cmd_match(buf, "faulty") && rdev->mddev->pers) { |
| md_error(rdev->mddev, rdev); |
| if (test_bit(Faulty, &rdev->flags)) |
| err = 0; |
| else |
| err = -EBUSY; |
| } else if (cmd_match(buf, "remove")) { |
| if (rdev->mddev->pers) { |
| clear_bit(Blocked, &rdev->flags); |
| remove_and_add_spares(rdev->mddev, rdev); |
| } |
| if (rdev->raid_disk >= 0) |
| err = -EBUSY; |
| else { |
| struct mddev *mddev = rdev->mddev; |
| err = 0; |
| if (mddev_is_clustered(mddev)) |
| err = md_cluster_ops->remove_disk(mddev, rdev); |
| |
| if (err == 0) { |
| md_kick_rdev_from_array(rdev); |
| if (mddev->pers) { |
| set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); |
| md_wakeup_thread(mddev->thread); |
| } |
| md_new_event(mddev); |
| } |
| } |
| } else if (cmd_match(buf, "writemostly")) { |
| set_bit(WriteMostly, &rdev->flags); |
| err = 0; |
| } else if (cmd_match(buf, "-writemostly")) { |
| clear_bit(WriteMostly, &rdev->flags); |
| err = 0; |
| } else if (cmd_match(buf, "blocked")) { |
| set_bit(Blocked, &rdev->flags); |
| err = 0; |
| } else if (cmd_match(buf, "-blocked")) { |
| if (!test_bit(Faulty, &rdev->flags) && |
| !test_bit(ExternalBbl, &rdev->flags) && |
| rdev->badblocks.unacked_exist) { |
| /* metadata handler doesn't understand badblocks, |
| * so we need to fail the device |
| */ |
| md_error(rdev->mddev, rdev); |
| } |
| clear_bit(Blocked, &rdev->flags); |
| clear_bit(BlockedBadBlocks, &rdev->flags); |
| wake_up(&rdev->blocked_wait); |
| set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); |
| md_wakeup_thread(rdev->mddev->thread); |
| |
| err = 0; |
| } else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) { |
| set_bit(In_sync, &rdev->flags); |
| err = 0; |
| } else if (cmd_match(buf, "failfast")) { |
| set_bit(FailFast, &rdev->flags); |
| err = 0; |
| } else if (cmd_match(buf, "-failfast")) { |
| clear_bit(FailFast, &rdev->flags); |
| err = 0; |
| } else if (cmd_match(buf, "-insync") && rdev->raid_disk >= 0 && |
| !test_bit(Journal, &rdev->flags)) { |
| if (rdev->mddev->pers == NULL) { |
| clear_bit(In_sync, &rdev->flags); |
| rdev->saved_raid_disk = rdev->raid_disk; |
| rdev->raid_disk = -1; |
| err = 0; |
| } |
| } else if (cmd_match(buf, "write_error")) { |
| set_bit(WriteErrorSeen, &rdev->flags); |
| err = 0; |
| } else if (cmd_match(buf, "-write_error")) { |
| clear_bit(WriteErrorSeen, &rdev->flags); |
| err = 0; |
| } else if (cmd_match(buf, "want_replacement")) { |
| /* Any non-spare device that is not a replacement can |
| * become want_replacement at any time, but we then need to |
| * check if recovery is needed. |
| */ |
| if (rdev->raid_disk >= 0 && |
| !test_bit(Journal, &rdev->flags) && |
| !test_bit(Replacement, &rdev->flags)) |
| set_bit(WantReplacement, &rdev->flags); |
| set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); |
| md_wakeup_thread(rdev->mddev->thread); |
| err = 0; |
| } else if (cmd_match(buf, "-want_replacement")) { |
| /* Clearing 'want_replacement' is always allowed. |
| * Once replacements starts it is too late though. |
| */ |
| err = 0; |
| clear_bit(WantReplacement, &rdev->flags); |
| } else if (cmd_match(buf, "replacement")) { |
| /* Can only set a device as a replacement when array has not |
| * yet been started. Once running, replacement is automatic |
| * from spares, or by assigning 'slot'. |
| */ |
| if (rdev->mddev->pers) |
| err = -EBUSY; |
| else { |
| set_bit(Replacement, &rdev->flags); |
| err = 0; |
| } |
| } else if (cmd_match(buf, "-replacement")) { |
| /* Similarly, can only clear Replacement before start */ |
| if (rdev->mddev->pers) |
| err = -EBUSY; |
| else { |
| clear_bit(Replacement, &rdev->flags); |
| err = 0; |
| } |
| } else if (cmd_match(buf, "re-add")) { |
| if (!rdev->mddev->pers) |
| err = -EINVAL; |
|