You can do a very simple testing of running two dd threads in two different
cgroups. Here is what you can do.
+- Enable Block IO controller
+ CONFIG_BLK_CGROUP=y
+
- Enable group scheduling in CFQ
CONFIG_CFQ_GROUP_IOSCHED=y
Various user visible config options
===================================
-CONFIG_CFQ_GROUP_IOSCHED
- - Enables group scheduling in CFQ. Currently only 1 level of group
- creation is allowed.
-
-CONFIG_DEBUG_CFQ_IOSCHED
- - Enables some debugging messages in blktrace. Also creates extra
- cgroup file blkio.dequeue.
-
-Config options selected automatically
-=====================================
-These config options are not user visible and are selected/deselected
-automatically based on IO scheduler configuration.
-
CONFIG_BLK_CGROUP
- - Block IO controller. Selected by CONFIG_CFQ_GROUP_IOSCHED.
+ - Block IO controller.
CONFIG_DEBUG_BLK_CGROUP
- - Debug help. Selected by CONFIG_DEBUG_CFQ_IOSCHED.
+ - Debug help. Right now some additional stats file show up in cgroup
+ if this option is enabled.
+
+CONFIG_CFQ_GROUP_IOSCHED
+ - Enables group scheduling in CFQ. Currently only 1 level of group
+ creation is allowed.
Details of cgroup files
=======================
- blkio.weight
- - Specifies per cgroup weight.
-
+ - Specifies per cgroup weight. This is default weight of the group
+ on all the devices until and unless overridden by per device rule.
+ (See blkio.weight_device).
Currently allowed range of weights is from 100 to 1000.
+- blkio.weight_device
+ - One can specify per cgroup per device rules using this interface.
+ These rules override the default value of group weight as specified
+ by blkio.weight.
+
+ Following is the format.
+
+ #echo dev_maj:dev_minor weight > /path/to/cgroup/blkio.weight_device
+ Configure weight=300 on /dev/sdb (8:16) in this cgroup
+ # echo 8:16 300 > blkio.weight_device
+ # cat blkio.weight_device
+ dev weight
+ 8:16 300
+
+ Configure weight=500 on /dev/sda (8:0) in this cgroup
+ # echo 8:0 500 > blkio.weight_device
+ # cat blkio.weight_device
+ dev weight
+ 8:0 500
+ 8:16 300
+
+ Remove specific weight for /dev/sda in this cgroup
+ # echo 8:0 0 > blkio.weight_device
+ # cat blkio.weight_device
+ dev weight
+ 8:16 300
+
- blkio.time
- disk time allocated to cgroup per device in milliseconds. First
two fields specify the major and minor number of the device and
third field specifies the number of sectors transferred by the
group to/from the device.
+- blkio.io_service_bytes
+ - Number of bytes transferred to/from the disk by the group. These
+ are further divided by the type of operation - read or write, sync
+ or async. First two fields specify the major and minor number of the
+ device, third field specifies the operation type and the fourth field
+ specifies the number of bytes.
+
+- blkio.io_serviced
+ - Number of IOs completed to/from the disk by the group. These
+ are further divided by the type of operation - read or write, sync
+ or async. First two fields specify the major and minor number of the
+ device, third field specifies the operation type and the fourth field
+ specifies the number of IOs.
+
+- blkio.io_service_time
+ - Total amount of time between request dispatch and request completion
+ for the IOs done by this cgroup. This is in nanoseconds to make it
+ meaningful for flash devices too. For devices with queue depth of 1,
+ this time represents the actual service time. When queue_depth > 1,
+ that is no longer true as requests may be served out of order. This
+ may cause the service time for a given IO to include the service time
+ of multiple IOs when served out of order which may result in total
+ io_service_time > actual time elapsed. This time is further divided by
+ the type of operation - read or write, sync or async. First two fields
+ specify the major and minor number of the device, third field
+ specifies the operation type and the fourth field specifies the
+ io_service_time in ns.
+
+- blkio.io_wait_time
+ - Total amount of time the IOs for this cgroup spent waiting in the
+ scheduler queues for service. This can be greater than the total time
+ elapsed since it is cumulative io_wait_time for all IOs. It is not a
+ measure of total time the cgroup spent waiting but rather a measure of
+ the wait_time for its individual IOs. For devices with queue_depth > 1
+ this metric does not include the time spent waiting for service once
+ the IO is dispatched to the device but till it actually gets serviced
+ (there might be a time lag here due to re-ordering of requests by the
+ device). This is in nanoseconds to make it meaningful for flash
+ devices too. This time is further divided by the type of operation -
+ read or write, sync or async. First two fields specify the major and
+ minor number of the device, third field specifies the operation type
+ and the fourth field specifies the io_wait_time in ns.
+
+- blkio.io_merged
+ - Total number of bios/requests merged into requests belonging to this
+ cgroup. This is further divided by the type of operation - read or
+ write, sync or async.
+
+- blkio.io_queued
+ - Total number of requests queued up at any given instant for this
+ cgroup. This is further divided by the type of operation - read or
+ write, sync or async.
+
+- blkio.avg_queue_size
+ - Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y.
+ The average queue size for this cgroup over the entire time of this
+ cgroup's existence. Queue size samples are taken each time one of the
+ queues of this cgroup gets a timeslice.
+
+- blkio.group_wait_time
+ - Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y.
+ This is the amount of time the cgroup had to wait since it became busy
+ (i.e., went from 0 to 1 request queued) to get a timeslice for one of
+ its queues. This is different from the io_wait_time which is the
+ cumulative total of the amount of time spent by each IO in that cgroup
+ waiting in the scheduler queue. This is in nanoseconds. If this is
+ read when the cgroup is in a waiting (for timeslice) state, the stat
+ will only report the group_wait_time accumulated till the last time it
+ got a timeslice and will not include the current delta.
+
+- blkio.empty_time
+ - Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y.
+ This is the amount of time a cgroup spends without any pending
+ requests when not being served, i.e., it does not include any time
+ spent idling for one of the queues of the cgroup. This is in
+ nanoseconds. If this is read when the cgroup is in an empty state,
+ the stat will only report the empty_time accumulated till the last
+ time it had a pending request and will not include the current delta.
+
+- blkio.idle_time
+ - Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y.
+ This is the amount of time spent by the IO scheduler idling for a
+ given cgroup in anticipation of a better request than the exising ones
+ from other queues/cgroups. This is in nanoseconds. If this is read
+ when the cgroup is in an idling state, the stat will only report the
+ idle_time accumulated till the last idle period and will not include
+ the current delta.
+
- blkio.dequeue
- - Debugging aid only enabled if CONFIG_DEBUG_CFQ_IOSCHED=y. This
+ - Debugging aid only enabled if CONFIG_DEBUG_BLK_CGROUP=y. This
gives the statistics about how many a times a group was dequeued
from service tree of the device. First two fields specify the major
and minor number of the device and third field specifies the number
of times a group was dequeued from a particular device.
+- blkio.reset_stats
+ - Writing an int to this file will result in resetting all the stats
+ for that cgroup.
+
CFQ sysfs tunable
=================
/sys/block/<disk>/queue/iosched/group_isolation
T10/SCSI Data Integrity Field or the T13/ATA External Path
Protection. If in doubt, say N.
-config BLK_CGROUP
- tristate "Block cgroup support"
- depends on CGROUPS
- depends on CFQ_GROUP_IOSCHED
- default n
- ---help---
- Generic block IO controller cgroup interface. This is the common
- cgroup interface which should be used by various IO controlling
- policies.
-
- Currently, CFQ IO scheduler uses it to recognize task groups and
- control disk bandwidth allocation (proportional time slice allocation)
- to such task groups.
-
-config DEBUG_BLK_CGROUP
- bool
- depends on BLK_CGROUP
- default n
- ---help---
- Enable some debugging help. Currently it stores the cgroup path
- in the blk group which can be used by cfq for tracing various
- group related activity.
-
endif # BLOCK
config BLOCK_COMPAT
config IOSCHED_CFQ
tristate "CFQ I/O scheduler"
- select BLK_CGROUP if CFQ_GROUP_IOSCHED
+ # If BLK_CGROUP is a module, CFQ has to be built as module.
+ depends on (BLK_CGROUP=m && m) || !BLK_CGROUP || BLK_CGROUP=y
default y
---help---
The CFQ I/O scheduler tries to distribute bandwidth equally
This is the default I/O scheduler.
+ Note: If BLK_CGROUP=m, then CFQ can be built only as module.
+
config CFQ_GROUP_IOSCHED
bool "CFQ Group Scheduling support"
- depends on IOSCHED_CFQ && CGROUPS
+ depends on IOSCHED_CFQ && BLK_CGROUP
default n
---help---
Enable group IO scheduling in CFQ.
-config DEBUG_CFQ_IOSCHED
- bool "Debug CFQ Scheduling"
- depends on CFQ_GROUP_IOSCHED
- select DEBUG_BLK_CGROUP
- default n
- ---help---
- Enable CFQ IO scheduling debugging in CFQ. Currently it makes
- blktrace output more verbose.
-
choice
prompt "Default I/O scheduler"
default DEFAULT_CFQ
obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o \
blk-barrier.o blk-settings.o blk-ioc.o blk-map.o \
blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \
- blk-iopoll.o ioctl.o genhd.o scsi_ioctl.o
+ blk-iopoll.o blk-lib.o ioctl.o genhd.o scsi_ioctl.o
obj-$(CONFIG_BLK_DEV_BSG) += bsg.o
obj-$(CONFIG_BLK_CGROUP) += blk-cgroup.o
set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
clear_bit(BIO_UPTODATE, &bio->bi_flags);
}
-
- complete(bio->bi_private);
+ if (bio->bi_private)
+ complete(bio->bi_private);
+ bio_put(bio);
}
/**
* blkdev_issue_flush - queue a flush
* @bdev: blockdev to issue flush for
+ * @gfp_mask: memory allocation flags (for bio_alloc)
* @error_sector: error sector
+ * @flags: BLKDEV_IFL_* flags to control behaviour
*
* Description:
* Issue a flush for the block device in question. Caller can supply
* room for storing the error offset in case of a flush error, if they
- * wish to.
+ * wish to. If WAIT flag is not passed then caller may check only what
+ * request was pushed in some internal queue for later handling.
*/
-int blkdev_issue_flush(struct block_device *bdev, sector_t *error_sector)
+int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask,
+ sector_t *error_sector, unsigned long flags)
{
DECLARE_COMPLETION_ONSTACK(wait);
struct request_queue *q;
struct bio *bio;
- int ret;
+ int ret = 0;
if (bdev->bd_disk == NULL)
return -ENXIO;
if (!q)
return -ENXIO;
- bio = bio_alloc(GFP_KERNEL, 0);
+ bio = bio_alloc(gfp_mask, 0);
bio->bi_end_io = bio_end_empty_barrier;
- bio->bi_private = &wait;
bio->bi_bdev = bdev;
- submit_bio(WRITE_BARRIER, bio);
-
- wait_for_completion(&wait);
+ if (test_bit(BLKDEV_WAIT, &flags))
+ bio->bi_private = &wait;
- /*
- * The driver must store the error location in ->bi_sector, if
- * it supports it. For non-stacked drivers, this should be copied
- * from blk_rq_pos(rq).
- */
- if (error_sector)
- *error_sector = bio->bi_sector;
+ bio_get(bio);
+ submit_bio(WRITE_BARRIER, bio);
+ if (test_bit(BLKDEV_WAIT, &flags)) {
+ wait_for_completion(&wait);
+ /*
+ * The driver must store the error location in ->bi_sector, if
+ * it supports it. For non-stacked drivers, this should be
+ * copied from blk_rq_pos(rq).
+ */
+ if (error_sector)
+ *error_sector = bio->bi_sector;
+ }
- ret = 0;
if (bio_flagged(bio, BIO_EOPNOTSUPP))
ret = -EOPNOTSUPP;
else if (!bio_flagged(bio, BIO_UPTODATE))
return ret;
}
EXPORT_SYMBOL(blkdev_issue_flush);
-
-static void blkdev_discard_end_io(struct bio *bio, int err)
-{
- if (err) {
- if (err == -EOPNOTSUPP)
- set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
- clear_bit(BIO_UPTODATE, &bio->bi_flags);
- }
-
- if (bio->bi_private)
- complete(bio->bi_private);
- __free_page(bio_page(bio));
-
- bio_put(bio);
-}
-
-/**
- * blkdev_issue_discard - queue a discard
- * @bdev: blockdev to issue discard for
- * @sector: start sector
- * @nr_sects: number of sectors to discard
- * @gfp_mask: memory allocation flags (for bio_alloc)
- * @flags: DISCARD_FL_* flags to control behaviour
- *
- * Description:
- * Issue a discard request for the sectors in question.
- */
-int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
- sector_t nr_sects, gfp_t gfp_mask, int flags)
-{
- DECLARE_COMPLETION_ONSTACK(wait);
- struct request_queue *q = bdev_get_queue(bdev);
- int type = flags & DISCARD_FL_BARRIER ?
- DISCARD_BARRIER : DISCARD_NOBARRIER;
- struct bio *bio;
- struct page *page;
- int ret = 0;
-
- if (!q)
- return -ENXIO;
-
- if (!blk_queue_discard(q))
- return -EOPNOTSUPP;
-
- while (nr_sects && !ret) {
- unsigned int sector_size = q->limits.logical_block_size;
- unsigned int max_discard_sectors =
- min(q->limits.max_discard_sectors, UINT_MAX >> 9);
-
- bio = bio_alloc(gfp_mask, 1);
- if (!bio)
- goto out;
- bio->bi_sector = sector;
- bio->bi_end_io = blkdev_discard_end_io;
- bio->bi_bdev = bdev;
- if (flags & DISCARD_FL_WAIT)
- bio->bi_private = &wait;
-
- /*
- * Add a zeroed one-sector payload as that's what
- * our current implementations need. If we'll ever need
- * more the interface will need revisiting.
- */
- page = alloc_page(gfp_mask | __GFP_ZERO);
- if (!page)
- goto out_free_bio;
- if (bio_add_pc_page(q, bio, page, sector_size, 0) < sector_size)
- goto out_free_page;
-
- /*
- * And override the bio size - the way discard works we
- * touch many more blocks on disk than the actual payload
- * length.
- */
- if (nr_sects > max_discard_sectors) {
- bio->bi_size = max_discard_sectors << 9;
- nr_sects -= max_discard_sectors;
- sector += max_discard_sectors;
- } else {
- bio->bi_size = nr_sects << 9;
- nr_sects = 0;
- }
-
- bio_get(bio);
- submit_bio(type, bio);
-
- if (flags & DISCARD_FL_WAIT)
- wait_for_completion(&wait);
-
- if (bio_flagged(bio, BIO_EOPNOTSUPP))
- ret = -EOPNOTSUPP;
- else if (!bio_flagged(bio, BIO_UPTODATE))
- ret = -EIO;
- bio_put(bio);
- }
- return ret;
-out_free_page:
- __free_page(page);
-out_free_bio:
- bio_put(bio);
-out:
- return -ENOMEM;
-}
-EXPORT_SYMBOL(blkdev_issue_discard);
#include <linux/kdev_t.h>
#include <linux/module.h>
#include <linux/err.h>
+#include <linux/blkdev.h>
#include <linux/slab.h>
#include "blk-cgroup.h"
+#include <linux/genhd.h>
+
+#define MAX_KEY_LEN 100
static DEFINE_SPINLOCK(blkio_list_lock);
static LIST_HEAD(blkio_list);
};
EXPORT_SYMBOL_GPL(blkio_subsys);
+static inline void blkio_policy_insert_node(struct blkio_cgroup *blkcg,
+ struct blkio_policy_node *pn)
+{
+ list_add(&pn->node, &blkcg->policy_list);
+}
+
+/* Must be called with blkcg->lock held */
+static inline void blkio_policy_delete_node(struct blkio_policy_node *pn)
+{
+ list_del(&pn->node);
+}
+
+/* Must be called with blkcg->lock held */
+static struct blkio_policy_node *
+blkio_policy_search_node(const struct blkio_cgroup *blkcg, dev_t dev)
+{
+ struct blkio_policy_node *pn;
+
+ list_for_each_entry(pn, &blkcg->policy_list, node) {
+ if (pn->dev == dev)
+ return pn;
+ }
+
+ return NULL;
+}
+
struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup)
{
return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id),
}
EXPORT_SYMBOL_GPL(cgroup_to_blkio_cgroup);
-void blkiocg_update_blkio_group_stats(struct blkio_group *blkg,
- unsigned long time, unsigned long sectors)
+/*
+ * Add to the appropriate stat variable depending on the request type.
+ * This should be called with the blkg->stats_lock held.
+ */
+static void blkio_add_stat(uint64_t *stat, uint64_t add, bool direction,
+ bool sync)
+{
+ if (direction)
+ stat[BLKIO_STAT_WRITE] += add;
+ else
+ stat[BLKIO_STAT_READ] += add;
+ if (sync)
+ stat[BLKIO_STAT_SYNC] += add;
+ else
+ stat[BLKIO_STAT_ASYNC] += add;
+}
+
+/*
+ * Decrements the appropriate stat variable if non-zero depending on the
+ * request type. Panics on value being zero.
+ * This should be called with the blkg->stats_lock held.
+ */
+static void blkio_check_and_dec_stat(uint64_t *stat, bool direction, bool sync)
+{
+ if (direction) {
+ BUG_ON(stat[BLKIO_STAT_WRITE] == 0);
+ stat[BLKIO_STAT_WRITE]--;
+ } else {
+ BUG_ON(stat[BLKIO_STAT_READ] == 0);
+ stat[BLKIO_STAT_READ]--;
+ }
+ if (sync) {
+ BUG_ON(stat[BLKIO_STAT_SYNC] == 0);
+ stat[BLKIO_STAT_SYNC]--;
+ } else {
+ BUG_ON(stat[BLKIO_STAT_ASYNC] == 0);
+ stat[BLKIO_STAT_ASYNC]--;
+ }
+}
+
+#ifdef CONFIG_DEBUG_BLK_CGROUP
+/* This should be called with the blkg->stats_lock held. */
+static void blkio_set_start_group_wait_time(struct blkio_group *blkg,
+ struct blkio_group *curr_blkg)
+{
+ if (blkio_blkg_waiting(&blkg->stats))
+ return;
+ if (blkg == curr_blkg)
+ return;
+ blkg->stats.start_group_wait_time = sched_clock();
+ blkio_mark_blkg_waiting(&blkg->stats);
+}
+
+/* This should be called with the blkg->stats_lock held. */
+static void blkio_update_group_wait_time(struct blkio_group_stats *stats)
+{
+ unsigned long long now;
+
+ if (!blkio_blkg_waiting(stats))
+ return;
+
+ now = sched_clock();
+ if (time_after64(now, stats->start_group_wait_time))
+ stats->group_wait_time += now - stats->start_group_wait_time;
+ blkio_clear_blkg_waiting(stats);
+}
+
+/* This should be called with the blkg->stats_lock held. */
+static void blkio_end_empty_time(struct blkio_group_stats *stats)
+{
+ unsigned long long now;
+
+ if (!blkio_blkg_empty(stats))
+ return;
+
+ now = sched_clock();
+ if (time_after64(now, stats->start_empty_time))
+ stats->empty_time += now - stats->start_empty_time;
+ blkio_clear_blkg_empty(stats);
+}
+
+void blkiocg_update_set_idle_time_stats(struct blkio_group *blkg)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&blkg->stats_lock, flags);
+ BUG_ON(blkio_blkg_idling(&blkg->stats));
+ blkg->stats.start_idle_time = sched_clock();
+ blkio_mark_blkg_idling(&blkg->stats);
+ spin_unlock_irqrestore(&blkg->stats_lock, flags);
+}
+EXPORT_SYMBOL_GPL(blkiocg_update_set_idle_time_stats);
+
+void blkiocg_update_idle_time_stats(struct blkio_group *blkg)
+{
+ unsigned long flags;
+ unsigned long long now;
+ struct blkio_group_stats *stats;
+
+ spin_lock_irqsave(&blkg->stats_lock, flags);
+ stats = &blkg->stats;
+ if (blkio_blkg_idling(stats)) {
+ now = sched_clock();
+ if (time_after64(now, stats->start_idle_time))
+ stats->idle_time += now - stats->start_idle_time;
+ blkio_clear_blkg_idling(stats);
+ }
+ spin_unlock_irqrestore(&blkg->stats_lock, flags);
+}
+EXPORT_SYMBOL_GPL(blkiocg_update_idle_time_stats);
+
+void blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg)
+{
+ unsigned long flags;
+ struct blkio_group_stats *stats;
+
+ spin_lock_irqsave(&blkg->stats_lock, flags);
+ stats = &blkg->stats;
+ stats->avg_queue_size_sum +=
+ stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_READ] +
+ stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_WRITE];
+ stats->avg_queue_size_samples++;
+ blkio_update_group_wait_time(stats);
+ spin_unlock_irqrestore(&blkg->stats_lock, flags);
+}
+EXPORT_SYMBOL_GPL(blkiocg_update_avg_queue_size_stats);
+
+void blkiocg_set_start_empty_time(struct blkio_group *blkg)
+{
+ unsigned long flags;
+ struct blkio_group_stats *stats;
+
+ spin_lock_irqsave(&blkg->stats_lock, flags);
+ stats = &blkg->stats;
+
+ if (stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_READ] ||
+ stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_WRITE]) {
+ spin_unlock_irqrestore(&blkg->stats_lock, flags);
+ return;
+ }
+
+ /*
+ * group is already marked empty. This can happen if cfqq got new
+ * request in parent group and moved to this group while being added
+ * to service tree. Just ignore the event and move on.
+ */
+ if(blkio_blkg_empty(stats)) {
+ spin_unlock_irqrestore(&blkg->stats_lock, flags);
+ return;
+ }
+
+ stats->start_empty_time = sched_clock();
+ blkio_mark_blkg_empty(stats);
+ spin_unlock_irqrestore(&blkg->stats_lock, flags);
+}
+EXPORT_SYMBOL_GPL(blkiocg_set_start_empty_time);
+
+void blkiocg_update_dequeue_stats(struct blkio_group *blkg,
+ unsigned long dequeue)
+{
+ blkg->stats.dequeue += dequeue;
+}
+EXPORT_SYMBOL_GPL(blkiocg_update_dequeue_stats);
+#else
+static inline void blkio_set_start_group_wait_time(struct blkio_group *blkg,
+ struct blkio_group *curr_blkg) {}
+static inline void blkio_end_empty_time(struct blkio_group_stats *stats) {}
+#endif
+
+void blkiocg_update_io_add_stats(struct blkio_group *blkg,
+ struct blkio_group *curr_blkg, bool direction,
+ bool sync)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&blkg->stats_lock, flags);
+ blkio_add_stat(blkg->stats.stat_arr[BLKIO_STAT_QUEUED], 1, direction,
+ sync);
+ blkio_end_empty_time(&blkg->stats);
+ blkio_set_start_group_wait_time(blkg, curr_blkg);
+ spin_unlock_irqrestore(&blkg->stats_lock, flags);
+}
+EXPORT_SYMBOL_GPL(blkiocg_update_io_add_stats);
+
+void blkiocg_update_io_remove_stats(struct blkio_group *blkg,
+ bool direction, bool sync)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&blkg->stats_lock, flags);
+ blkio_check_and_dec_stat(blkg->stats.stat_arr[BLKIO_STAT_QUEUED],
+ direction, sync);
+ spin_unlock_irqrestore(&blkg->stats_lock, flags);
+}
+EXPORT_SYMBOL_GPL(blkiocg_update_io_remove_stats);
+
+void blkiocg_update_timeslice_used(struct blkio_group *blkg, unsigned long time)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&blkg->stats_lock, flags);
+ blkg->stats.time += time;
+ spin_unlock_irqrestore(&blkg->stats_lock, flags);
+}
+EXPORT_SYMBOL_GPL(blkiocg_update_timeslice_used);
+
+void blkiocg_update_dispatch_stats(struct blkio_group *blkg,
+ uint64_t bytes, bool direction, bool sync)
{
- blkg->time += time;
- blkg->sectors += sectors;
+ struct blkio_group_stats *stats;
+ unsigned long flags;
+
+ spin_lock_irqsave(&blkg->stats_lock, flags);
+ stats = &blkg->stats;
+ stats->sectors += bytes >> 9;
+ blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICED], 1, direction,
+ sync);
+ blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICE_BYTES], bytes,
+ direction, sync);
+ spin_unlock_irqrestore(&blkg->stats_lock, flags);
}
-EXPORT_SYMBOL_GPL(blkiocg_update_blkio_group_stats);
+EXPORT_SYMBOL_GPL(blkiocg_update_dispatch_stats);
+
+void blkiocg_update_completion_stats(struct blkio_group *blkg,
+ uint64_t start_time, uint64_t io_start_time, bool direction, bool sync)
+{
+ struct blkio_group_stats *stats;
+ unsigned long flags;
+ unsigned long long now = sched_clock();
+
+ spin_lock_irqsave(&blkg->stats_lock, flags);
+ stats = &blkg->stats;
+ if (time_after64(now, io_start_time))
+ blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICE_TIME],
+ now - io_start_time, direction, sync);
+ if (time_after64(io_start_time, start_time))
+ blkio_add_stat(stats->stat_arr[BLKIO_STAT_WAIT_TIME],
+ io_start_time - start_time, direction, sync);
+ spin_unlock_irqrestore(&blkg->stats_lock, flags);
+}
+EXPORT_SYMBOL_GPL(blkiocg_update_completion_stats);
+
+void blkiocg_update_io_merged_stats(struct blkio_group *blkg, bool direction,
+ bool sync)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&blkg->stats_lock, flags);
+ blkio_add_stat(blkg->stats.stat_arr[BLKIO_STAT_MERGED], 1, direction,
+ sync);
+ spin_unlock_irqrestore(&blkg->stats_lock, flags);
+}
+EXPORT_SYMBOL_GPL(blkiocg_update_io_merged_stats);
void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
struct blkio_group *blkg, void *key, dev_t dev)
unsigned long flags;
spin_lock_irqsave(&blkcg->lock, flags);
+ spin_lock_init(&blkg->stats_lock);
rcu_assign_pointer(blkg->key, key);
blkg->blkcg_id = css_id(&blkcg->css);
hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list);
spin_unlock_irqrestore(&blkcg->lock, flags);
-#ifdef CONFIG_DEBUG_BLK_CGROUP
/* Need to take css reference ? */
cgroup_path(blkcg->css.cgroup, blkg->path, sizeof(blkg->path));
-#endif
blkg->dev = dev;
}
EXPORT_SYMBOL_GPL(blkiocg_add_blkio_group);
rcu_read_lock();
css = css_lookup(&blkio_subsys, blkg->blkcg_id);
- if (!css)
- goto out;
-
- blkcg = container_of(css, struct blkio_cgroup, css);
- spin_lock_irqsave(&blkcg->lock, flags);
- if (!hlist_unhashed(&blkg->blkcg_node)) {
- __blkiocg_del_blkio_group(blkg);
- ret = 0;
+ if (css) {
+ blkcg = container_of(css, struct blkio_cgroup, css);
+ spin_lock_irqsave(&blkcg->lock, flags);
+ if (!hlist_unhashed(&blkg->blkcg_node)) {
+ __blkiocg_del_blkio_group(blkg);
+ ret = 0;
+ }
+ spin_unlock_irqrestore(&blkcg->lock, flags);
}
- spin_unlock_irqrestore(&blkcg->lock, flags);
-out:
+
rcu_read_unlock();
return ret;
}
struct blkio_group *blkg;
struct hlist_node *n;
struct blkio_policy_type *blkiop;
+ struct blkio_policy_node *pn;
if (val < BLKIO_WEIGHT_MIN || val > BLKIO_WEIGHT_MAX)
return -EINVAL;
spin_lock(&blkio_list_lock);
spin_lock_irq(&blkcg->lock);
blkcg->weight = (unsigned int)val;
+
hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
+ pn = blkio_policy_search_node(blkcg, blkg->dev);
+
+ if (pn)
+ continue;
+
list_for_each_entry(blkiop, &blkio_list, list)
blkiop->ops.blkio_update_group_weight_fn(blkg,
blkcg->weight);
return 0;
}
-#define SHOW_FUNCTION_PER_GROUP(__VAR) \
+static int
+blkiocg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, u64 val)
+{
+ struct blkio_cgroup *blkcg;
+ struct blkio_group *blkg;
+ struct blkio_group_stats *stats;
+ struct hlist_node *n;
+ uint64_t queued[BLKIO_STAT_TOTAL];
+ int i;
+#ifdef CONFIG_DEBUG_BLK_CGROUP
+ bool idling, waiting, empty;
+ unsigned long long now = sched_clock();
+#endif
+
+ blkcg = cgroup_to_blkio_cgroup(cgroup);
+ spin_lock_irq(&blkcg->lock);
+ hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
+ spin_lock(&blkg->stats_lock);
+ stats = &blkg->stats;
+#ifdef CONFIG_DEBUG_BLK_CGROUP
+ idling = blkio_blkg_idling(stats);
+ waiting = blkio_blkg_waiting(stats);
+ empty = blkio_blkg_empty(stats);
+#endif
+ for (i = 0; i < BLKIO_STAT_TOTAL; i++)
+ queued[i] = stats->stat_arr[BLKIO_STAT_QUEUED][i];
+ memset(stats, 0, sizeof(struct blkio_group_stats));
+ for (i = 0; i < BLKIO_STAT_TOTAL; i++)
+ stats->stat_arr[BLKIO_STAT_QUEUED][i] = queued[i];
+#ifdef CONFIG_DEBUG_BLK_CGROUP
+ if (idling) {
+ blkio_mark_blkg_idling(stats);
+ stats->start_idle_time = now;
+ }
+ if (waiting) {
+ blkio_mark_blkg_waiting(stats);
+ stats->start_group_wait_time = now;
+ }
+ if (empty) {
+ blkio_mark_blkg_empty(stats);
+ stats->start_empty_time = now;
+ }
+#endif
+ spin_unlock(&blkg->stats_lock);
+ }
+ spin_unlock_irq(&blkcg->lock);
+ return 0;
+}
+
+static void blkio_get_key_name(enum stat_sub_type type, dev_t dev, char *str,
+ int chars_left, bool diskname_only)
+{
+ snprintf(str, chars_left, "%d:%d", MAJOR(dev), MINOR(dev));
+ chars_left -= strlen(str);
+ if (chars_left <= 0) {
+ printk(KERN_WARNING
+ "Possibly incorrect cgroup stat display format");
+ return;
+ }
+ if (diskname_only)
+ return;
+ switch (type) {
+ case BLKIO_STAT_READ:
+ strlcat(str, " Read", chars_left);
+ break;
+ case BLKIO_STAT_WRITE:
+ strlcat(str, " Write", chars_left);
+ break;
+ case BLKIO_STAT_SYNC:
+ strlcat(str, " Sync", chars_left);
+ break;
+ case BLKIO_STAT_ASYNC:
+ strlcat(str, " Async", chars_left);
+ break;
+ case BLKIO_STAT_TOTAL:
+ strlcat(str, " Total", chars_left);
+ break;
+ default:
+ strlcat(str, " Invalid", chars_left);
+ }
+}
+
+static uint64_t blkio_fill_stat(char *str, int chars_left, uint64_t val,
+ struct cgroup_map_cb *cb, dev_t dev)
+{
+ blkio_get_key_name(0, dev, str, chars_left, true);
+ cb->fill(cb, str, val);
+ return val;
+}
+
+/* This should be called with blkg->stats_lock held */
+static uint64_t blkio_get_stat(struct blkio_group *blkg,
+ struct cgroup_map_cb *cb, dev_t dev, enum stat_type type)
+{
+ uint64_t disk_total;
+ char key_str[MAX_KEY_LEN];
+ enum stat_sub_type sub_type;
+
+ if (type == BLKIO_STAT_TIME)
+ return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
+ blkg->stats.time, cb, dev);
+ if (type == BLKIO_STAT_SECTORS)
+ return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
+ blkg->stats.sectors, cb, dev);
+#ifdef CONFIG_DEBUG_BLK_CGROUP
+ if (type == BLKIO_STAT_AVG_QUEUE_SIZE) {
+ uint64_t sum = blkg->stats.avg_queue_size_sum;
+ uint64_t samples = blkg->stats.avg_queue_size_samples;
+ if (samples)
+ do_div(sum, samples);
+ else
+ sum = 0;
+ return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, sum, cb, dev);
+ }
+ if (type == BLKIO_STAT_GROUP_WAIT_TIME)
+ return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
+ blkg->stats.group_wait_time, cb, dev);
+ if (type == BLKIO_STAT_IDLE_TIME)
+ return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
+ blkg->stats.idle_time, cb, dev);
+ if (type == BLKIO_STAT_EMPTY_TIME)
+ return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
+ blkg->stats.empty_time, cb, dev);
+ if (type == BLKIO_STAT_DEQUEUE)
+ return blkio_fill_stat(key_str, MAX_KEY_LEN - 1,
+ blkg->stats.dequeue, cb, dev);
+#endif
+
+ for (sub_type = BLKIO_STAT_READ; sub_type < BLKIO_STAT_TOTAL;
+ sub_type++) {
+ blkio_get_key_name(sub_type, dev, key_str, MAX_KEY_LEN, false);
+ cb->fill(cb, key_str, blkg->stats.stat_arr[type][sub_type]);
+ }
+ disk_total = blkg->stats.stat_arr[type][BLKIO_STAT_READ] +
+ blkg->stats.stat_arr[type][BLKIO_STAT_WRITE];
+ blkio_get_key_name(BLKIO_STAT_TOTAL, dev, key_str, MAX_KEY_LEN, false);
+ cb->fill(cb, key_str, disk_total);
+ return disk_total;
+}
+
+#define SHOW_FUNCTION_PER_GROUP(__VAR, type, show_total) \
static int blkiocg_##__VAR##_read(struct cgroup *cgroup, \
- struct cftype *cftype, struct seq_file *m) \
+ struct cftype *cftype, struct cgroup_map_cb *cb) \
{ \
struct blkio_cgroup *blkcg; \
struct blkio_group *blkg; \
struct hlist_node *n; \
+ uint64_t cgroup_total = 0; \
\
if (!cgroup_lock_live_group(cgroup)) \
return -ENODEV; \
blkcg = cgroup_to_blkio_cgroup(cgroup); \
rcu_read_lock(); \
hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) {\
- if (blkg->dev) \
- seq_printf(m, "%u:%u %lu\n", MAJOR(blkg->dev), \
- MINOR(blkg->dev), blkg->__VAR); \
+ if (blkg->dev) { \
+ spin_lock_irq(&blkg->stats_lock); \
+ cgroup_total += blkio_get_stat(blkg, cb, \
+ blkg->dev, type); \
+ spin_unlock_irq(&blkg->stats_lock); \
+ } \
} \
+ if (show_total) \
+ cb->fill(cb, "Total", cgroup_total); \
rcu_read_unlock(); \
cgroup_unlock(); \
return 0; \
}
-SHOW_FUNCTION_PER_GROUP(time);
-SHOW_FUNCTION_PER_GROUP(sectors);
+SHOW_FUNCTION_PER_GROUP(time, BLKIO_STAT_TIME, 0);
+SHOW_FUNCTION_PER_GROUP(sectors, BLKIO_STAT_SECTORS, 0);
+SHOW_FUNCTION_PER_GROUP(io_service_bytes, BLKIO_STAT_SERVICE_BYTES, 1);
+SHOW_FUNCTION_PER_GROUP(io_serviced, BLKIO_STAT_SERVICED, 1);
+SHOW_FUNCTION_PER_GROUP(io_service_time, BLKIO_STAT_SERVICE_TIME, 1);
+SHOW_FUNCTION_PER_GROUP(io_wait_time, BLKIO_STAT_WAIT_TIME, 1);
+SHOW_FUNCTION_PER_GROUP(io_merged, BLKIO_STAT_MERGED, 1);
+SHOW_FUNCTION_PER_GROUP(io_queued, BLKIO_STAT_QUEUED, 1);
#ifdef CONFIG_DEBUG_BLK_CGROUP
-SHOW_FUNCTION_PER_GROUP(dequeue);
+SHOW_FUNCTION_PER_GROUP(dequeue, BLKIO_STAT_DEQUEUE, 0);
+SHOW_FUNCTION_PER_GROUP(avg_queue_size, BLKIO_STAT_AVG_QUEUE_SIZE, 0);
+SHOW_FUNCTION_PER_GROUP(group_wait_time, BLKIO_STAT_GROUP_WAIT_TIME, 0);
+SHOW_FUNCTION_PER_GROUP(idle_time, BLKIO_STAT_IDLE_TIME, 0);
+SHOW_FUNCTION_PER_GROUP(empty_time, BLKIO_STAT_EMPTY_TIME, 0);
#endif
#undef SHOW_FUNCTION_PER_GROUP
-#ifdef CONFIG_DEBUG_BLK_CGROUP
-void blkiocg_update_blkio_group_dequeue_stats(struct blkio_group *blkg,
- unsigned long dequeue)
+static int blkio_check_dev_num(dev_t dev)
{
- blkg->dequeue += dequeue;
+ int part = 0;
+ struct gendisk *disk;
+
+ disk = get_gendisk(dev, &part);
+ if (!disk || part)
+ return -ENODEV;
+
+ return 0;
+}
+
+static int blkio_policy_parse_and_set(char *buf,
+ struct blkio_policy_node *newpn)
+{
+ char *s[4], *p, *major_s = NULL, *minor_s = NULL;
+ int ret;
+ unsigned long major, minor, temp;
+ int i = 0;
+ dev_t dev;
+
+ memset(s, 0, sizeof(s));
+
+ while ((p = strsep(&buf, " ")) != NULL) {
+ if (!*p)
+ continue;
+
+ s[i++] = p;
+
+ /* Prevent from inputing too many things */
+ if (i == 3)
+ break;
+ }
+
+ if (i != 2)
+ return -EINVAL;
+
+ p = strsep(&s[0], ":");
+ if (p != NULL)
+ major_s = p;
+ else
+ return -EINVAL;
+
+ minor_s = s[0];
+ if (!minor_s)
+ return -EINVAL;
+
+ ret = strict_strtoul(major_s, 10, &major);
+ if (ret)
+ return -EINVAL;
+
+ ret = strict_strtoul(minor_s, 10, &minor);
+ if (ret)
+ return -EINVAL;
+
+ dev = MKDEV(major, minor);
+
+ ret = blkio_check_dev_num(dev);
+ if (ret)
+ return ret;
+
+ newpn->dev = dev;
+
+ if (s[1] == NULL)
+ return -EINVAL;
+
+ ret = strict_strtoul(s[1], 10, &temp);
+ if (ret || (temp < BLKIO_WEIGHT_MIN && temp > 0) ||
+ temp > BLKIO_WEIGHT_MAX)
+ return -EINVAL;
+
+ newpn->weight = temp;
+
+ return 0;
+}
+
+unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg,
+ dev_t dev)
+{
+ struct blkio_policy_node *pn;
+
+ pn = blkio_policy_search_node(blkcg, dev);
+ if (pn)
+ return pn->weight;
+ else
+ return blkcg->weight;
+}
+EXPORT_SYMBOL_GPL(blkcg_get_weight);
+
+
+static int blkiocg_weight_device_write(struct cgroup *cgrp, struct cftype *cft,
+ const char *buffer)
+{
+ int ret = 0;
+ char *buf;
+ struct blkio_policy_node *newpn, *pn;
+ struct blkio_cgroup *blkcg;
+ struct blkio_group *blkg;
+ int keep_newpn = 0;
+ struct hlist_node *n;
+ struct blkio_policy_type *blkiop;
+
+ buf = kstrdup(buffer, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+
+ newpn = kzalloc(sizeof(*newpn), GFP_KERNEL);
+ if (!newpn) {
+ ret = -ENOMEM;
+ goto free_buf;
+ }
+
+ ret = blkio_policy_parse_and_set(buf, newpn);
+ if (ret)
+ goto free_newpn;
+
+ blkcg = cgroup_to_blkio_cgroup(cgrp);
+
+ spin_lock_irq(&blkcg->lock);
+
+ pn = blkio_policy_search_node(blkcg, newpn->dev);
+ if (!pn) {
+ if (newpn->weight != 0) {
+ blkio_policy_insert_node(blkcg, newpn);
+ keep_newpn = 1;
+ }
+ spin_unlock_irq(&blkcg->lock);
+ goto update_io_group;
+ }
+
+ if (newpn->weight == 0) {
+ /* weight == 0 means deleteing a specific weight */
+ blkio_policy_delete_node(pn);
+ spin_unlock_irq(&blkcg->lock);
+ goto update_io_group;
+ }
+ spin_unlock_irq(&blkcg->lock);
+
+ pn->weight = newpn->weight;
+
+update_io_group:
+ /* update weight for each cfqg */
+ spin_lock(&blkio_list_lock);
+ spin_lock_irq(&blkcg->lock);
+
+ hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
+ if (newpn->dev == blkg->dev) {
+ list_for_each_entry(blkiop, &blkio_list, list)
+ blkiop->ops.blkio_update_group_weight_fn(blkg,
+ newpn->weight ?
+ newpn->weight :
+ blkcg->weight);
+ }
+ }
+
+ spin_unlock_irq(&blkcg->lock);
+ spin_unlock(&blkio_list_lock);
+
+free_newpn:
+ if (!keep_newpn)
+ kfree(newpn);
+free_buf:
+ kfree(buf);
+ return ret;
+}
+
+static int blkiocg_weight_device_read(struct cgroup *cgrp, struct cftype *cft,
+ struct seq_file *m)
+{
+ struct blkio_cgroup *blkcg;
+ struct blkio_policy_node *pn;
+
+ seq_printf(m, "dev\tweight\n");
+
+ blkcg = cgroup_to_blkio_cgroup(cgrp);
+ if (!list_empty(&blkcg->policy_list)) {
+ spin_lock_irq(&blkcg->lock);
+ list_for_each_entry(pn, &blkcg->policy_list, node) {
+ seq_printf(m, "%u:%u\t%u\n", MAJOR(pn->dev),
+ MINOR(pn->dev), pn->weight);
+ }
+ spin_unlock_irq(&blkcg->lock);
+ }
+
+ return 0;
}
-EXPORT_SYMBOL_GPL(blkiocg_update_blkio_group_dequeue_stats);
-#endif
struct cftype blkio_files[] = {
+ {
+ .name = "weight_device",
+ .read_seq_string = blkiocg_weight_device_read,
+ .write_string = blkiocg_weight_device_write,
+ .max_write_len = 256,
+ },
{
.name = "weight",
.read_u64 = blkiocg_weight_read,
},
{
.name = "time",
- .read_seq_string = blkiocg_time_read,
+ .read_map = blkiocg_time_read,
},
{
.name = "sectors",
- .read_seq_string = blkiocg_sectors_read,
+ .read_map = blkiocg_sectors_read,
+ },
+ {
+ .name = "io_service_bytes",
+ .read_map = blkiocg_io_service_bytes_read,
+ },
+ {
+ .name = "io_serviced",
+ .read_map = blkiocg_io_serviced_read,
+ },
+ {
+ .name = "io_service_time",
+ .read_map = blkiocg_io_service_time_read,
+ },
+ {
+ .name = "io_wait_time",
+ .read_map = blkiocg_io_wait_time_read,
+ },
+ {
+ .name = "io_merged",
+ .read_map = blkiocg_io_merged_read,
+ },
+ {
+ .name = "io_queued",
+ .read_map = blkiocg_io_queued_read,
+ },
+ {
+ .name = "reset_stats",
+ .write_u64 = blkiocg_reset_stats,
},
#ifdef CONFIG_DEBUG_BLK_CGROUP
- {
+ {
+ .name = "avg_queue_size",
+ .read_map = blkiocg_avg_queue_size_read,
+ },
+ {
+ .name = "group_wait_time",
+ .read_map = blkiocg_group_wait_time_read,
+ },
+ {
+ .name = "idle_time",
+ .read_map = blkiocg_idle_time_read,
+ },
+ {
+ .name = "empty_time",
+ .read_map = blkiocg_empty_time_read,
+ },
+ {
.name = "dequeue",
- .read_seq_string = blkiocg_dequeue_read,
- },
+ .read_map = blkiocg_dequeue_read,
+ },
#endif
};
struct blkio_group *blkg;
void *key;
struct blkio_policy_type *blkiop;
+ struct blkio_policy_node *pn, *pntmp;
rcu_read_lock();
-remove_entry:
- spin_lock_irqsave(&blkcg->lock, flags);
+ do {
+ spin_lock_irqsave(&blkcg->lock, flags);
+
+ if (hlist_empty(&blkcg->blkg_list)) {
+ spin_unlock_irqrestore(&blkcg->lock, flags);
+ break;
+ }
+
+ blkg = hlist_entry(blkcg->blkg_list.first, struct blkio_group,
+ blkcg_node);
+ key = rcu_dereference(blkg->key);
+ __blkiocg_del_blkio_group(blkg);
- if (hlist_empty(&blkcg->blkg_list)) {
spin_unlock_irqrestore(&blkcg->lock, flags);
- goto done;
- }
- blkg = hlist_entry(blkcg->blkg_list.first, struct blkio_group,
- blkcg_node);
- key = rcu_dereference(blkg->key);
- __blkiocg_del_blkio_group(blkg);
+ /*
+ * This blkio_group is being unlinked as associated cgroup is
+ * going away. Let all the IO controlling policies know about
+ * this event. Currently this is static call to one io
+ * controlling policy. Once we have more policies in place, we
+ * need some dynamic registration of callback function.
+ */
+ spin_lock(&blkio_list_lock);
+ list_for_each_entry(blkiop, &blkio_list, list)
+ blkiop->ops.blkio_unlink_group_fn(key, blkg);
+ spin_unlock(&blkio_list_lock);
+ } while (1);
- spin_unlock_irqrestore(&blkcg->lock, flags);
+ list_for_each_entry_safe(pn, pntmp, &blkcg->policy_list, node) {
+ blkio_policy_delete_node(pn);
+ kfree(pn);
+ }
- /*
- * This blkio_group is being unlinked as associated cgroup is going
- * away. Let all the IO controlling policies know about this event.
- *
- * Currently this is static call to one io controlling policy. Once
- * we have more policies in place, we need some dynamic registration
- * of callback function.
- */
- spin_lock(&blkio_list_lock);
- list_for_each_entry(blkiop, &blkio_list, list)
- blkiop->ops.blkio_unlink_group_fn(key, blkg);
- spin_unlock(&blkio_list_lock);
- goto remove_entry;
-done:
free_css_id(&blkio_subsys, &blkcg->css);
rcu_read_unlock();
if (blkcg != &blkio_root_cgroup)
spin_lock_init(&blkcg->lock);
INIT_HLIST_HEAD(&blkcg->blkg_list);
+ INIT_LIST_HEAD(&blkcg->policy_list);
return &blkcg->css;
}
#define blkio_subsys_id blkio_subsys.subsys_id
#endif
+enum stat_type {
+ /* Total time spent (in ns) between request dispatch to the driver and
+ * request completion for IOs doen by this cgroup. This may not be
+ * accurate when NCQ is turned on. */
+ BLKIO_STAT_SERVICE_TIME = 0,
+ /* Total bytes transferred */
+ BLKIO_STAT_SERVICE_BYTES,
+ /* Total IOs serviced, post merge */
+ BLKIO_STAT_SERVICED,
+ /* Total time spent waiting in scheduler queue in ns */
+ BLKIO_STAT_WAIT_TIME,
+ /* Number of IOs merged */
+ BLKIO_STAT_MERGED,
+ /* Number of IOs queued up */
+ BLKIO_STAT_QUEUED,
+ /* All the single valued stats go below this */
+ BLKIO_STAT_TIME,
+ BLKIO_STAT_SECTORS,
+#ifdef CONFIG_DEBUG_BLK_CGROUP
+ BLKIO_STAT_AVG_QUEUE_SIZE,
+ BLKIO_STAT_IDLE_TIME,
+ BLKIO_STAT_EMPTY_TIME,
+ BLKIO_STAT_GROUP_WAIT_TIME,
+ BLKIO_STAT_DEQUEUE
+#endif
+};
+
+enum stat_sub_type {
+ BLKIO_STAT_READ = 0,
+ BLKIO_STAT_WRITE,
+ BLKIO_STAT_SYNC,
+ BLKIO_STAT_ASYNC,
+ BLKIO_STAT_TOTAL
+};
+
+/* blkg state flags */
+enum blkg_state_flags {
+ BLKG_waiting = 0,
+ BLKG_idling,
+ BLKG_empty,
+};
+
struct blkio_cgroup {
struct cgroup_subsys_state css;
unsigned int weight;
spinlock_t lock;
struct hlist_head blkg_list;
+ struct list_head policy_list; /* list of blkio_policy_node */
+};
+
+struct blkio_group_stats {
+ /* total disk time and nr sectors dispatched by this group */
+ uint64_t time;
+ uint64_t sectors;
+ uint64_t stat_arr[BLKIO_STAT_QUEUED + 1][BLKIO_STAT_TOTAL];
+#ifdef CONFIG_DEBUG_BLK_CGROUP
+ /* Sum of number of IOs queued across all samples */
+ uint64_t avg_queue_size_sum;
+ /* Count of samples taken for average */
+ uint64_t avg_queue_size_samples;
+ /* How many times this group has been removed from service tree */
+ unsigned long dequeue;
+
+ /* Total time spent waiting for it to be assigned a timeslice. */
+ uint64_t group_wait_time;
+ uint64_t start_group_wait_time;
+
+ /* Time spent idling for this blkio_group */
+ uint64_t idle_time;
+ uint64_t start_idle_time;
+ /*
+ * Total time when we have requests queued and do not contain the
+ * current active queue.
+ */
+ uint64_t empty_time;
+ uint64_t start_empty_time;
+ uint16_t flags;
+#endif
};
struct blkio_group {
void *key;
struct hlist_node blkcg_node;
unsigned short blkcg_id;
-#ifdef CONFIG_DEBUG_BLK_CGROUP
/* Store cgroup path */
char path[128];
- /* How many times this group has been removed from service tree */
- unsigned long dequeue;
-#endif
/* The device MKDEV(major, minor), this group has been created for */
- dev_t dev;
+ dev_t dev;
- /* total disk time and nr sectors dispatched by this group */
- unsigned long time;
- unsigned long sectors;
+ /* Need to serialize the stats in the case of reset/update */
+ spinlock_t stats_lock;
+ struct blkio_group_stats stats;
};
+struct blkio_policy_node {
+ struct list_head node;
+ dev_t dev;
+ unsigned int weight;
+};
+
+extern unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg,
+ dev_t dev);
+
typedef void (blkio_unlink_group_fn) (void *key, struct blkio_group *blkg);
typedef void (blkio_update_group_weight_fn) (struct blkio_group *blkg,
unsigned int weight);
extern void blkio_policy_register(struct blkio_policy_type *);
extern void blkio_policy_unregister(struct blkio_policy_type *);
+static inline char *blkg_path(struct blkio_group *blkg)
+{
+ return blkg->path;
+}
+
#else
struct blkio_group {
static inline void blkio_policy_register(struct blkio_policy_type *blkiop) { }
static inline void blkio_policy_unregister(struct blkio_policy_type *blkiop) { }
+static inline char *blkg_path(struct blkio_group *blkg) { return NULL; }
+
#endif
#define BLKIO_WEIGHT_MIN 100
#define BLKIO_WEIGHT_DEFAULT 500
#ifdef CONFIG_DEBUG_BLK_CGROUP
-static inline char *blkg_path(struct blkio_group *blkg)
-{
- return blkg->path;
-}
-void blkiocg_update_blkio_group_dequeue_stats(struct blkio_group *blkg,
+void blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg);
+void blkiocg_update_dequeue_stats(struct blkio_group *blkg,
unsigned long dequeue);
+void blkiocg_update_set_idle_time_stats(struct blkio_group *blkg);
+void blkiocg_update_idle_time_stats(struct blkio_group *blkg);
+void blkiocg_set_start_empty_time(struct blkio_group *blkg);
+
+#define BLKG_FLAG_FNS(name) \
+static inline void blkio_mark_blkg_##name( \
+ struct blkio_group_stats *stats) \
+{ \
+ stats->flags |= (1 << BLKG_##name); \
+} \
+static inline void blkio_clear_blkg_##name( \
+ struct blkio_group_stats *stats) \
+{ \
+ stats->flags &= ~(1 << BLKG_##name); \
+} \
+static inline int blkio_blkg_##name(struct blkio_group_stats *stats) \
+{ \
+ return (stats->flags & (1 << BLKG_##name)) != 0; \
+} \
+
+BLKG_FLAG_FNS(waiting)
+BLKG_FLAG_FNS(idling)
+BLKG_FLAG_FNS(empty)
+#undef BLKG_FLAG_FNS
#else
-static inline char *blkg_path(struct blkio_group *blkg) { return NULL; }
-static inline void blkiocg_update_blkio_group_dequeue_stats(
- struct blkio_group *blkg, unsigned long dequeue) {}
+static inline void blkiocg_update_avg_queue_size_stats(
+ struct blkio_group *blkg) {}
+static inline void blkiocg_update_dequeue_stats(struct blkio_group *blkg,
+ unsigned long dequeue) {}
+static inline void blkiocg_update_set_idle_time_stats(struct blkio_group *blkg)
+{}
+static inline void blkiocg_update_idle_time_stats(struct blkio_group *blkg) {}
+static inline void blkiocg_set_start_empty_time(struct blkio_group *blkg) {}
#endif
#if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE)
extern int blkiocg_del_blkio_group(struct blkio_group *blkg);
extern struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg,
void *key);
-void blkiocg_update_blkio_group_stats(struct blkio_group *blkg,
- unsigned long time, unsigned long sectors);
+void blkiocg_update_timeslice_used(struct blkio_group *blkg,
+ unsigned long time);
+void blkiocg_update_dispatch_stats(struct blkio_group *blkg, uint64_t bytes,
+ bool direction, bool sync);
+void blkiocg_update_completion_stats(struct blkio_group *blkg,
+ uint64_t start_time, uint64_t io_start_time, bool direction, bool sync);
+void blkiocg_update_io_merged_stats(struct blkio_group *blkg, bool direction,
+ bool sync);
+void blkiocg_update_io_add_stats(struct blkio_group *blkg,
+ struct blkio_group *curr_blkg, bool direction, bool sync);
+void blkiocg_update_io_remove_stats(struct blkio_group *blkg,
+ bool direction, bool sync);
#else
struct cgroup;
static inline struct blkio_cgroup *
cgroup_to_blkio_cgroup(struct cgroup *cgroup) { return NULL; }
static inline void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
- struct blkio_group *blkg, void *key, dev_t dev)
-{
-}
+ struct blkio_group *blkg, void *key, dev_t dev) {}
static inline int
blkiocg_del_blkio_group(struct blkio_group *blkg) { return 0; }
static inline struct blkio_group *
blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key) { return NULL; }
-static inline void blkiocg_update_blkio_group_stats(struct blkio_group *blkg,
- unsigned long time, unsigned long sectors)
-{
-}
+static inline void blkiocg_update_timeslice_used(struct blkio_group *blkg,
+ unsigned long time) {}
+static inline void blkiocg_update_dispatch_stats(struct blkio_group *blkg,
+ uint64_t bytes, bool direction, bool sync) {}
+static inline void blkiocg_update_completion_stats(struct blkio_group *blkg,
+ uint64_t start_time, uint64_t io_start_time, bool direction,
+ bool sync) {}
+static inline void blkiocg_update_io_merged_stats(struct blkio_group *blkg,
+ bool direction, bool sync) {}
+static inline void blkiocg_update_io_add_stats(struct blkio_group *blkg,
+ struct blkio_group *curr_blkg, bool direction, bool sync) {}
+static inline void blkiocg_update_io_remove_stats(struct blkio_group *blkg,
+ bool direction, bool sync) {}
#endif
#endif /* _BLK_CGROUP_H */
rq->tag = -1;
rq->ref_count = 1;
rq->start_time = jiffies;
+ set_start_time_ns(rq);
}
EXPORT_SYMBOL(blk_rq_init);
*/
blk_sync_queue(q);
+ del_timer_sync(&q->backing_dev_info.laptop_mode_wb_timer);
mutex_lock(&q->sysfs_lock);
queue_flag_set_unlocked(QUEUE_FLAG_DEAD, q);
mutex_unlock(&q->sysfs_lock);
return NULL;
}
+ setup_timer(&q->backing_dev_info.laptop_mode_wb_timer,
+ laptop_mode_timer_fn, (unsigned long) q);
init_timer(&q->unplug_timer);
setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q);
INIT_LIST_HEAD(&q->timeout_list);
{
struct request_queue *q = blk_alloc_queue_node(GFP_KERNEL, node_id);
+ return blk_init_allocated_queue_node(q, rfn, lock, node_id);
+}
+EXPORT_SYMBOL(blk_init_queue_node);
+
+struct request_queue *
+blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn,
+ spinlock_t *lock)
+{
+ return blk_init_allocated_queue_node(q, rfn, lock, -1);
+}
+EXPORT_SYMBOL(blk_init_allocated_queue);
+
+struct request_queue *
+blk_init_allocated_queue_node(struct request_queue *q, request_fn_proc *rfn,
+ spinlock_t *lock, int node_id)
+{
if (!q)
return NULL;
blk_put_queue(q);
return NULL;
}
-EXPORT_SYMBOL(blk_init_queue_node);
+EXPORT_SYMBOL(blk_init_allocated_queue_node);
int blk_get_queue(struct request_queue *q)
{
if (!blk_rq_cpu_valid(req))
req->cpu = bio->bi_comp_cpu;
drive_stat_acct(req, 0);
+ elv_bio_merged(q, req, bio);
if (!attempt_back_merge(q, req))
elv_merged_request(q, req, el_ret);
goto out;
if (!blk_rq_cpu_valid(req))
req->cpu = bio->bi_comp_cpu;
drive_stat_acct(req, 0);
+ elv_bio_merged(q, req, bio);
if (!attempt_front_merge(q, req))
elv_merged_request(q, req, el_ret);
goto out;
* and to it is freed is accounted as io that is in progress at
* the driver side.
*/
- if (blk_account_rq(rq))
+ if (blk_account_rq(rq)) {
q->in_flight[rq_is_sync(rq)]++;
+ set_io_start_time_ns(rq);
+ }
}
/**
BUG_ON(blk_queued_rq(req));
if (unlikely(laptop_mode) && blk_fs_request(req))
- laptop_io_completion();
+ laptop_io_completion(&req->q->backing_dev_info);
blk_delete_timer(req);
return 0;
}
-
--- /dev/null
+/*
+ * Functions related to generic helpers functions
+ */
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/bio.h>
+#include <linux/blkdev.h>
+#include <linux/scatterlist.h>
+
+#include "blk.h"
+
+static void blkdev_discard_end_io(struct bio *bio, int err)
+{
+ if (err) {
+ if (err == -EOPNOTSUPP)
+ set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
+ clear_bit(BIO_UPTODATE, &bio->bi_flags);
+ }
+
+ if (bio->bi_private)
+ complete(bio->bi_private);
+ __free_page(bio_page(bio));
+
+ bio_put(bio);
+}
+
+/**
+ * blkdev_issue_discard - queue a discard
+ * @bdev: blockdev to issue discard for
+ * @sector: start sector
+ * @nr_sects: number of sectors to discard
+ * @gfp_mask: memory allocation flags (for bio_alloc)
+ * @flags: BLKDEV_IFL_* flags to control behaviour
+ *
+ * Description:
+ * Issue a discard request for the sectors in question.
+ */
+int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
+ sector_t nr_sects, gfp_t gfp_mask, unsigned long flags)
+{
+ DECLARE_COMPLETION_ONSTACK(wait);
+ struct request_queue *q = bdev_get_queue(bdev);
+ int type = flags & BLKDEV_IFL_BARRIER ?
+ DISCARD_BARRIER : DISCARD_NOBARRIER;
+ struct bio *bio;
+ struct page *page;
+ int ret = 0;
+
+ if (!q)
+ return -ENXIO;
+
+ if (!blk_queue_discard(q))
+ return -EOPNOTSUPP;
+
+ while (nr_sects && !ret) {
+ unsigned int sector_size = q->limits.logical_block_size;
+ unsigned int max_discard_sectors =
+ min(q->limits.max_discard_sectors, UINT_MAX >> 9);
+
+ bio = bio_alloc(gfp_mask, 1);
+ if (!bio)
+ goto out;
+ bio->bi_sector = sector;
+ bio->bi_end_io = blkdev_discard_end_io;
+ bio->bi_bdev = bdev;
+ if (flags & BLKDEV_IFL_WAIT)
+ bio->bi_private = &wait;
+
+ /*
+ * Add a zeroed one-sector payload as that's what
+ * our current implementations need. If we'll ever need
+ * more the interface will need revisiting.
+ */
+ page = alloc_page(gfp_mask | __GFP_ZERO);
+ if (!page)
+ goto out_free_bio;
+ if (bio_add_pc_page(q, bio, page, sector_size, 0) < sector_size)
+ goto out_free_page;
+
+ /*
+ * And override the bio size - the way discard works we
+ * touch many more blocks on disk than the actual payload
+ * length.
+ */
+ if (nr_sects > max_discard_sectors) {
+ bio->bi_size = max_discard_sectors << 9;
+ nr_sects -= max_discard_sectors;
+ sector += max_discard_sectors;
+ } else {
+ bio->bi_size = nr_sects << 9;
+ nr_sects = 0;
+ }
+
+ bio_get(bio);
+ submit_bio(type, bio);
+
+ if (flags & BLKDEV_IFL_WAIT)
+ wait_for_completion(&wait);
+
+ if (bio_flagged(bio, BIO_EOPNOTSUPP))
+ ret = -EOPNOTSUPP;
+ else if (!bio_flagged(bio, BIO_UPTODATE))
+ ret = -EIO;
+ bio_put(bio);
+ }
+ return ret;
+out_free_page:
+ __free_page(page);
+out_free_bio:
+ bio_put(bio);
+out:
+ return -ENOMEM;
+}
+EXPORT_SYMBOL(blkdev_issue_discard);
+
+struct bio_batch
+{
+ atomic_t done;
+ unsigned long flags;
+ struct completion *wait;
+ bio_end_io_t *end_io;
+};
+
+static void bio_batch_end_io(struct bio *bio, int err)
+{
+ struct bio_batch *bb = bio->bi_private;
+
+ if (err) {
+ if (err == -EOPNOTSUPP)
+ set_bit(BIO_EOPNOTSUPP, &bb->flags);
+ else
+ clear_bit(BIO_UPTODATE, &bb->flags);
+ }
+ if (bb) {
+ if (bb->end_io)
+ bb->end_io(bio, err);
+ atomic_inc(&bb->done);
+ complete(bb->wait);
+ }
+ bio_put(bio);
+}
+
+/**
+ * blkdev_issue_zeroout generate number of zero filed write bios
+ * @bdev: blockdev to issue
+ * @sector: start sector
+ * @nr_sects: number of sectors to write
+ * @gfp_mask: memory allocation flags (for bio_alloc)
+ * @flags: BLKDEV_IFL_* flags to control behaviour
+ *
+ * Description:
+ * Generate and issue number of bios with zerofiled pages.
+ * Send barrier at the beginning and at the end if requested. This guarantie
+ * correct request ordering. Empty barrier allow us to avoid post queue flush.
+ */
+
+int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
+ sector_t nr_sects, gfp_t gfp_mask, unsigned long flags)
+{
+ int ret = 0;
+ struct bio *bio;
+ struct bio_batch bb;
+ unsigned int sz, issued = 0;
+ DECLARE_COMPLETION_ONSTACK(wait);
+
+ atomic_set(&bb.done, 0);
+ bb.flags = 1 << BIO_UPTODATE;
+ bb.wait = &wait;
+ bb.end_io = NULL;
+
+ if (flags & BLKDEV_IFL_BARRIER) {
+ /* issue async barrier before the data */
+ ret = blkdev_issue_flush(bdev, gfp_mask, NULL, 0);
+ if (ret)
+ return ret;
+ }
+submit:
+ while (nr_sects != 0) {
+ bio = bio_alloc(gfp_mask,
+ min(nr_sects, (sector_t)BIO_MAX_PAGES));
+ if (!bio)
+ break;
+
+ bio->bi_sector = sector;
+ bio->bi_bdev = bdev;
+ bio->bi_end_io = bio_batch_end_io;
+ if (flags & BLKDEV_IFL_WAIT)
+ bio->bi_private = &bb;
+
+ while (nr_sects != 0) {
+ sz = min((sector_t) PAGE_SIZE >> 9 , nr_sects);
+ if (sz == 0)
+ /* bio has maximum size possible */
+ break;
+ ret = bio_add_page(bio, ZERO_PAGE(0), sz << 9, 0);
+ nr_sects -= ret >> 9;
+ sector += ret >> 9;
+ if (ret < (sz << 9))
+ break;
+ }
+ issued++;
+ submit_bio(WRITE, bio);
+ }
+ /*
+ * When all data bios are in flight. Send final barrier if requeted.
+ */
+ if (nr_sects == 0 && flags & BLKDEV_IFL_BARRIER)
+ ret = blkdev_issue_flush(bdev, gfp_mask, NULL,
+ flags & BLKDEV_IFL_WAIT);
+
+
+ if (flags & BLKDEV_IFL_WAIT)
+ /* Wait for bios in-flight */
+ while ( issued != atomic_read(&bb.done))
+ wait_for_completion(&wait);
+
+ if (!test_bit(BIO_UPTODATE, &bb.flags))
+ /* One of bios in the batch was completed with error.*/
+ ret = -EIO;
+
+ if (ret)
+ goto out;
+
+ if (test_bit(BIO_EOPNOTSUPP, &bb.flags)) {
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
+ if (nr_sects != 0)
+ goto submit;
+out:
+ return ret;
+}
+EXPORT_SYMBOL(blkdev_issue_zeroout);
#define RQ_CIC(rq) \
((struct cfq_io_context *) (rq)->elevator_private)
#define RQ_CFQQ(rq) (struct cfq_queue *) ((rq)->elevator_private2)
+#define RQ_CFQG(rq) (struct cfq_group *) ((rq)->elevator_private3)
static struct kmem_cache *cfq_pool;
static struct kmem_cache *cfq_ioc_pool;
struct cfq_queue *new_cfqq;
struct cfq_group *cfqg;
struct cfq_group *orig_cfqg;
- /* Sectors dispatched in current dispatch round */
- unsigned long nr_sectors;
};
/*
CFQ_CFQQ_FNS(wait_busy);
#undef CFQ_CFQQ_FNS
-#ifdef CONFIG_DEBUG_CFQ_IOSCHED
+#ifdef CONFIG_CFQ_GROUP_IOSCHED
#define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \
blk_add_trace_msg((cfqd)->queue, "cfq%d%c %s " fmt, (cfqq)->pid, \
cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \
if (!RB_EMPTY_NODE(&cfqg->rb_node))
cfq_rb_erase(&cfqg->rb_node, st);
cfqg->saved_workload_slice = 0;
- blkiocg_update_blkio_group_dequeue_stats(&cfqg->blkg, 1);
+ blkiocg_update_dequeue_stats(&cfqg->blkg, 1);
}
static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq)
slice_used = cfqq->allocated_slice;
}
- cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u sect=%lu", slice_used,
- cfqq->nr_sectors);
+ cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u", slice_used);
return slice_used;
}
cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", cfqg->vdisktime,
st->min_vdisktime);
- blkiocg_update_blkio_group_stats(&cfqg->blkg, used_sl,
- cfqq->nr_sectors);
+ blkiocg_update_timeslice_used(&cfqg->blkg, used_sl);
+ blkiocg_set_start_empty_time(&cfqg->blkg);
}
#ifdef CONFIG_CFQ_GROUP_IOSCHED
if (!cfqg)
goto done;
- cfqg->weight = blkcg->weight;
for_each_cfqg_st(cfqg, i, j, st)
*st = CFQ_RB_ROOT;
RB_CLEAR_NODE(&cfqg->rb_node);
sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd,
MKDEV(major, minor));
+ cfqg->weight = blkcg_get_weight(blkcg, cfqg->blkg.dev);
/* Add group on cfqd list */
hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list);
return cfqg;
}
+static inline struct cfq_group *cfq_ref_get_cfqg(struct cfq_group *cfqg)
+{
+ atomic_inc(&cfqg->ref);
+ return cfqg;
+}
+
static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg)
{
/* Currently, all async queues are mapped to root group */
{
return &cfqd->root_group;
}
+
+static inline struct cfq_group *cfq_ref_get_cfqg(struct cfq_group *cfqg)
+{
+ return cfqg;
+}
+
static inline void
cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) {
cfqq->cfqg = cfqg;
{
elv_rb_del(&cfqq->sort_list, rq);
cfqq->queued[rq_is_sync(rq)]--;
+ blkiocg_update_io_remove_stats(&(RQ_CFQG(rq))->blkg, rq_data_dir(rq),
+ rq_is_sync(rq));
cfq_add_rq_rb(rq);
+ blkiocg_update_io_add_stats(&(RQ_CFQG(rq))->blkg,
+ &cfqq->cfqd->serving_group->blkg, rq_data_dir(rq),
+ rq_is_sync(rq));
}
static struct request *
cfq_del_rq_rb(rq);
cfqq->cfqd->rq_queued--;
+ blkiocg_update_io_remove_stats(&(RQ_CFQG(rq))->blkg, rq_data_dir(rq),
+ rq_is_sync(rq));
if (rq_is_meta(rq)) {
WARN_ON(!cfqq->meta_pending);
cfqq->meta_pending--;
}
}
+static void cfq_bio_merged(struct request_queue *q, struct request *req,
+ struct bio *bio)
+{
+ blkiocg_update_io_merged_stats(&(RQ_CFQG(req))->blkg, bio_data_dir(bio),
+ cfq_bio_sync(bio));
+}
+
static void
cfq_merged_requests(struct request_queue *q, struct request *rq,
struct request *next)
if (cfqq->next_rq == next)
cfqq->next_rq = rq;
cfq_remove_request(next);
+ blkiocg_update_io_merged_stats(&(RQ_CFQG(rq))->blkg, rq_data_dir(next),
+ rq_is_sync(next));
}
static int cfq_allow_merge(struct request_queue *q, struct request *rq,
return cfqq == RQ_CFQQ(rq);
}
+static inline void cfq_del_timer(struct cfq_data *cfqd, struct cfq_queue *cfqq)
+{
+ del_timer(&cfqd->idle_slice_timer);
+ blkiocg_update_idle_time_stats(&cfqq->cfqg->blkg);
+}
+
static void __cfq_set_active_queue(struct cfq_data *cfqd,
struct cfq_queue *cfqq)
{
if (cfqq) {
cfq_log_cfqq(cfqd, cfqq, "set_active wl_prio:%d wl_type:%d",
cfqd->serving_prio, cfqd->serving_type);
+ blkiocg_update_avg_queue_size_stats(&cfqq->cfqg->blkg);
cfqq->slice_start = 0;
cfqq->dispatch_start = jiffies;
cfqq->allocated_slice = 0;
cfqq->slice_end = 0;
cfqq->slice_dispatch = 0;
- cfqq->nr_sectors = 0;
cfq_clear_cfqq_wait_request(cfqq);
cfq_clear_cfqq_must_dispatch(cfqq);
cfq_clear_cfqq_fifo_expire(cfqq);
cfq_mark_cfqq_slice_new(cfqq);
- del_timer(&cfqd->idle_slice_timer);
+ cfq_del_timer(cfqd, cfqq);
}
cfqd->active_queue = cfqq;
cfq_log_cfqq(cfqd, cfqq, "slice expired t=%d", timed_out);
if (cfq_cfqq_wait_request(cfqq))
- del_timer(&cfqd->idle_slice_timer);
+ cfq_del_timer(cfqd, cfqq);
cfq_clear_cfqq_wait_request(cfqq);
cfq_clear_cfqq_wait_busy(cfqq);
sl = cfqd->cfq_slice_idle;
mod_timer(&cfqd->idle_slice_timer, jiffies + sl);
+ blkiocg_update_set_idle_time_stats(&cfqq->cfqg->blkg);
cfq_log_cfqq(cfqd, cfqq, "arm_idle: %lu", sl);
}
elv_dispatch_sort(q, rq);
cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]++;
- cfqq->nr_sectors += blk_rq_sectors(rq);
+ blkiocg_update_dispatch_stats(&cfqq->cfqg->blkg, blk_rq_bytes(rq),
+ rq_data_dir(rq), rq_is_sync(rq));
}
/*
if (cfq_cfqq_wait_request(cfqq)) {
if (blk_rq_bytes(rq) > PAGE_CACHE_SIZE ||
cfqd->busy_queues > 1) {
- del_timer(&cfqd->idle_slice_timer);
+ cfq_del_timer(cfqd, cfqq);
cfq_clear_cfqq_wait_request(cfqq);
__blk_run_queue(cfqd->queue);
- } else
+ } else {
+ blkiocg_update_idle_time_stats(
+ &cfqq->cfqg->blkg);
cfq_mark_cfqq_must_dispatch(cfqq);
+ }
}
} else if (cfq_should_preempt(cfqd, cfqq, rq)) {
/*
rq_set_fifo_time(rq, jiffies + cfqd->cfq_fifo_expire[rq_is_sync(rq)]);
list_add_tail(&rq->queuelist, &cfqq->fifo);
cfq_add_rq_rb(rq);
-
+ blkiocg_update_io_add_stats(&(RQ_CFQG(rq))->blkg,
+ &cfqd->serving_group->blkg, rq_data_dir(rq),
+ rq_is_sync(rq));
cfq_rq_enqueued(cfqd, cfqq, rq);
}
WARN_ON(!cfqq->dispatched);
cfqd->rq_in_driver--;
cfqq->dispatched--;
+ blkiocg_update_completion_stats(&cfqq->cfqg->blkg, rq_start_time_ns(rq),
+ rq_io_start_time_ns(rq), rq_data_dir(rq),
+ rq_is_sync(rq));
cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]--;
rq->elevator_private = NULL;
rq->elevator_private2 = NULL;
+ /* Put down rq reference on cfqg */
+ cfq_put_cfqg(RQ_CFQG(rq));
+ rq->elevator_private3 = NULL;
+
cfq_put_queue(cfqq);
}
}
rq->elevator_private = cic;
rq->elevator_private2 = cfqq;
+ rq->elevator_private3 = cfq_ref_get_cfqg(cfqq->cfqg);
return 0;
queue_fail:
* second, in order to have larger depth for async operations.
*/
cfqd->last_delayed_sync = jiffies - HZ;
- INIT_RCU_HEAD(&cfqd->rcu);
return cfqd;
}
.elevator_merged_fn = cfq_merged_request,
.elevator_merge_req_fn = cfq_merged_requests,
.elevator_allow_merge_fn = cfq_allow_merge,
+ .elevator_bio_merged_fn = cfq_bio_merged,
.elevator_dispatch_fn = cfq_dispatch_requests,
.elevator_add_req_fn = cfq_insert_request,
.elevator_activate_req_fn = cfq_activate_request,
q->last_merge = rq;
}
+void elv_bio_merged(struct request_queue *q, struct request *rq,
+ struct bio *bio)
+{
+ struct elevator_queue *e = q->elevator;
+
+ if (e->ops->elevator_bio_merged_fn)
+ e->ops->elevator_bio_merged_fn(q, rq, bio);
+}
+
void elv_requeue_request(struct request_queue *q, struct request *rq)
{
/*
}
return error;
}
+EXPORT_SYMBOL(elv_register_queue);
static void __elv_unregister_queue(struct elevator_queue *e)
{
if (q)
__elv_unregister_queue(q->elevator);
}
+EXPORT_SYMBOL(elv_unregister_queue);
void elv_register(struct elevator_type *e)
{
return disk;
}
+EXPORT_SYMBOL(get_gendisk);
/**
* bdget_disk - do bdget() by gendisk and partition number
if (!new_ptbl)
return -ENOMEM;
- INIT_RCU_HEAD(&new_ptbl->rcu_head);
new_ptbl->len = target;
for (i = 0; i < len; i++)
if (start + len > (bdev->bd_inode->i_size >> 9))
return -EINVAL;
return blkdev_issue_discard(bdev, start, len, GFP_KERNEL,
- DISCARD_FL_WAIT);
+ BLKDEV_IFL_WAIT);
}
static int put_ushort(unsigned long arg, unsigned short val)
It's pretty unlikely that you have one of these: say N.
+config GDROM
+ tristate "SEGA Dreamcast GD-ROM drive"
+ depends on SH_DREAMCAST
+ help
+ A standard SEGA Dreamcast comes with a modified CD ROM drive called a
+ "GD-ROM" by SEGA to signify it is capable of reading special disks
+ with up to 1 GB of data. This drive will also read standard CD ROM
+ disks. Select this option to access any disks in your GD ROM drive.
+ Most users will want to say "Y" here.
+ You can also build this as a module which will be called gdrom.
+
config PARIDE
tristate "Parallel port IDE device support"
depends on PARPORT_PC
"MicroSolutions backpack protocol", "DataStor Commuter protocol"
etc.).
-config GDROM
- tristate "SEGA Dreamcast GD-ROM drive"
- depends on SH_DREAMCAST
- help
- A standard SEGA Dreamcast comes with a modified CD ROM drive called a
- "GD-ROM" by SEGA to signify it is capable of reading special disks
- with up to 1 GB of data. This drive will also read standard CD ROM
- disks. Select this option to access any disks in your GD ROM drive.
- Most users will want to say "Y" here.
- You can also build this as a module which will be called gdrom.
-
source "drivers/block/paride/Kconfig"
config BLK_CPQ_DA
#define BM_MD_IO_ERROR 1
#define BM_P_VMALLOCED 2
+static int __bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s,
+ unsigned long e, int val, const enum km_type km);
+
static int bm_is_locked(struct drbd_bitmap *b)
{
return test_bit(BM_LOCKED, &b->bm_flags);
* In case this is actually a resize, we copy the old bitmap into the new one.
* Otherwise, the bitmap is initialized to all bits set.
*/
-int drbd_bm_resize(struct drbd_conf *mdev, sector_t capacity)
+int drbd_bm_resize(struct drbd_conf *mdev, sector_t capacity, int set_new_bits)
{
struct drbd_bitmap *b = mdev->bitmap;
unsigned long bits, words, owords, obits, *p_addr, *bm;
obits = b->bm_bits;
growing = bits > obits;
- if (opages)
+ if (opages && growing && set_new_bits)
bm_set_surplus(b);
b->bm_pages = npages;
b->bm_dev_capacity = capacity;
if (growing) {
- bm_memset(b, owords, 0xff, words-owords);
- b->bm_set += bits - obits;
+ if (set_new_bits) {
+ bm_memset(b, owords, 0xff, words-owords);
+ b->bm_set += bits - obits;
+ } else
+ bm_memset(b, owords, 0x00, words-owords);
+
}
if (want < have) {
/* nothing to do, on disk == in memory */
# define bm_cpu_to_lel(x) ((void)0)
# else
-void bm_cpu_to_lel(struct drbd_bitmap *b)
+static void bm_cpu_to_lel(struct drbd_bitmap *b)
{
/* need to cpu_to_lel all the pages ...
* this may be optimized by using
* wants bitnr, not sector.
* expected to be called for only a few bits (e - s about BITS_PER_LONG).
* Must hold bitmap lock already. */
-int __bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s,
+static int __bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s,
unsigned long e, int val, const enum km_type km)
{
struct drbd_bitmap *b = mdev->bitmap;
* for val != 0, we change 0 -> 1, return code positive
* for val == 0, we change 1 -> 0, return code negative
* wants bitnr, not sector */
-int bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s,
+static int bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s,
const unsigned long e, int val)
{
unsigned long flags;
DRBD_FAULT_DT_RA = 6, /* data read ahead */
DRBD_FAULT_BM_ALLOC = 7, /* bitmap allocation */
DRBD_FAULT_AL_EE = 8, /* alloc ee */
+ DRBD_FAULT_RECEIVE = 9, /* Changes some bytes upon receiving a [rs]data block */
DRBD_FAULT_MAX,
};
P_RS_IS_IN_SYNC = 0x22, /* meta socket */
P_SYNC_PARAM89 = 0x23, /* data socket, protocol version 89 replacement for P_SYNC_PARAM */
P_COMPRESSED_BITMAP = 0x24, /* compressed or otherwise encoded bitmap transfer */
+ /* P_CKPT_FENCE_REQ = 0x25, * currently reserved for protocol D */
+ /* P_CKPT_DISABLE_REQ = 0x26, * currently reserved for protocol D */
+ P_DELAY_PROBE = 0x27, /* is used on BOTH sockets */
- P_MAX_CMD = 0x25,
+ P_MAX_CMD = 0x28,
P_MAY_IGNORE = 0x100, /* Flag to test if (cmd > P_MAY_IGNORE) ... */
P_MAX_OPT_CMD = 0x101,
[P_CSUM_RS_REQUEST] = "CsumRSRequest",
[P_RS_IS_IN_SYNC] = "CsumRSIsInSync",
[P_COMPRESSED_BITMAP] = "CBitmap",
+ [P_DELAY_PROBE] = "DelayProbe",
[P_MAX_CMD] = NULL,
};
u64 u_size; /* user requested size */
u64 c_size; /* current exported size */
u32 max_segment_size; /* Maximal size of a BIO */
- u32 queue_order_type;
+ u16 queue_order_type; /* not yet implemented in DRBD*/
+ u16 dds_flags; /* use enum dds_flags here. */
} __packed;
struct p_state {
u8 code[0];
} __packed;
+struct p_delay_probe {
+ struct p_header head;
+ u32 seq_num; /* sequence number to match the two probe packets */
+ u32 offset; /* usecs the probe got sent after the reference time point */
+} __packed;
+
+struct delay_probe {
+ struct list_head list;
+ unsigned int seq_num;
+ struct timeval time;
+};
+
/* DCBP: Drbd Compressed Bitmap Packet ... */
static inline enum drbd_bitmap_code
DCBP_get_code(struct p_compressed_bm *p)
EV_CLEANUP = 32, /* used as flag */
};
-struct drbd_epoch_entry {
- struct drbd_work w;
- struct drbd_conf *mdev;
- struct bio *private_bio;
- struct hlist_node colision;
- sector_t sector;
- unsigned int size;
- struct drbd_epoch *epoch;
-
- /* up to here, the struct layout is identical to drbd_request;
- * we might be able to use that to our advantage... */
-
- unsigned int flags;
- u64 block_id;
-};
-
struct drbd_wq_barrier {
struct drbd_work w;
struct completion done;
void *digest;
};
-/* ee flag bits */
+struct drbd_epoch_entry {
+ struct drbd_work w;
+ struct hlist_node colision;
+ struct drbd_epoch *epoch;
+ struct drbd_conf *mdev;
+ struct page *pages;
+ atomic_t pending_bios;
+ unsigned int size;
+ /* see comments on ee flag bits below */
+ unsigned long flags;
+ sector_t sector;
+ u64 block_id;
+};
+
+/* ee flag bits.
+ * While corresponding bios are in flight, the only modification will be
+ * set_bit WAS_ERROR, which has to be atomic.
+ * If no bios are in flight yet, or all have been completed,
+ * non-atomic modification to ee->flags is ok.
+ */
enum {
__EE_CALL_AL_COMPLETE_IO,
- __EE_CONFLICT_PENDING,
__EE_MAY_SET_IN_SYNC,
+
+ /* This epoch entry closes an epoch using a barrier.
+ * On sucessful completion, the epoch is released,
+ * and the P_BARRIER_ACK send. */
__EE_IS_BARRIER,
+
+ /* In case a barrier failed,
+ * we need to resubmit without the barrier flag. */
+ __EE_RESUBMITTED,
+
+ /* we may have several bios per epoch entry.
+ * if any of those fail, we set this flag atomically
+ * from the endio callback */
+ __EE_WAS_ERROR,
};
#define EE_CALL_AL_COMPLETE_IO (1<<__EE_CALL_AL_COMPLETE_IO)
-#define EE_CONFLICT_PENDING (1<<__EE_CONFLICT_PENDING)
#define EE_MAY_SET_IN_SYNC (1<<__EE_MAY_SET_IN_SYNC)
#define EE_IS_BARRIER (1<<__EE_IS_BARRIER)
+#define EE_RESUBMITTED (1<<__EE_RESUBMITTED)
+#define EE_WAS_ERROR (1<<__EE_WAS_ERROR)
/* global flag bits */
enum {
unsigned int ko_count;
struct drbd_work resync_work,
unplug_work,
- md_sync_work;
+ md_sync_work,
+ delay_probe_work,
+ uuid_work;
struct timer_list resync_timer;
struct timer_list md_sync_timer;
+ struct timer_list delay_probe_timer;
/* Used after attach while negotiating new disk state. */
union drbd_state new_state_tmp;
u64 ed_uuid; /* UUID of the exposed data */
struct mutex state_mutex;
char congestion_reason; /* Why we where congested... */
+ struct list_head delay_probes; /* protected by peer_seq_lock */
+ int data_delay; /* Delay of packets on the data-sock behind meta-sock */
+ unsigned int delay_seq; /* To generate sequence numbers of delay probes */
+ struct timeval dps_time; /* delay-probes-start-time */
+ unsigned int dp_volume_last; /* send_cnt of last delay probe */
+ int c_sync_rate; /* current resync rate after delay_probe magic */
+ atomic_t new_c_uuid;
};
static inline struct drbd_conf *minor_to_mdev(unsigned int minor)
CS_ORDERED = CS_WAIT_COMPLETE + CS_SERIALIZE,
};
+enum dds_flags {
+ DDSF_FORCED = 1,
+ DDSF_NO_RESYNC = 2, /* Do not run a resync for the new space */
+};
+
extern void drbd_init_set_defaults(struct drbd_conf *mdev);
extern int drbd_change_state(struct drbd_conf *mdev, enum chg_state_flags f,
union drbd_state mask, union drbd_state val);
extern int drbd_send_uuids(struct drbd_conf *mdev);
extern int drbd_send_uuids_skip_initial_sync(struct drbd_conf *mdev);
extern int drbd_send_sync_uuid(struct drbd_conf *mdev, u64 val);
-extern int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply);
+extern int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags flags);
extern int _drbd_send_state(struct drbd_conf *mdev);
extern int drbd_send_state(struct drbd_conf *mdev);
extern int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock,
#define APP_R_HSIZE 15
extern int drbd_bm_init(struct drbd_conf *mdev);
-extern int drbd_bm_resize(struct drbd_conf *mdev, sector_t sectors);
+extern int drbd_bm_resize(struct drbd_conf *mdev, sector_t sectors, int set_new_bits);
extern void drbd_bm_cleanup(struct drbd_conf *mdev);
extern void drbd_bm_set_all(struct drbd_conf *mdev);
extern void drbd_bm_clear_all(struct drbd_conf *mdev);
extern char *ppsize(char *buf, unsigned long long size);
extern sector_t drbd_new_dev_size(struct drbd_conf *, struct drbd_backing_dev *, int);
enum determine_dev_size { dev_size_error = -1, unchanged = 0, shrunk = 1, grew = 2 };
-extern enum determine_dev_size drbd_determin_dev_size(struct drbd_conf *, int force) __must_hold(local);
+extern enum determine_dev_size drbd_determin_dev_size(struct drbd_conf *, enum dds_flags) __must_hold(local);
extern void resync_after_online_grow(struct drbd_conf *);
extern void drbd_setup_queue_param(struct drbd_conf *mdev, unsigned int) __must_hold(local);
extern int drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role,
}
-extern void drbd_csum(struct drbd_conf *, struct crypto_hash *, struct bio *, void *);
+extern void drbd_csum_bio(struct drbd_conf *, struct crypto_hash *, struct bio *, void *);
+extern void drbd_csum_ee(struct drbd_conf *, struct crypto_hash *, struct drbd_epoch_entry *, void *);
/* worker callbacks */
extern int w_req_cancel_conflict(struct drbd_conf *, struct drbd_work *, int);
extern int w_read_retry_remote(struct drbd_conf *, struct drbd_work *, int);
extern void resync_timer_fn(unsigned long data);
/* drbd_receiver.c */
+extern int drbd_submit_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e,
+ const unsigned rw, const int fault_type);
extern int drbd_release_ee(struct drbd_conf *mdev, struct list_head *list);
extern struct drbd_epoch_entry *drbd_alloc_ee(struct drbd_conf *mdev,
u64 id,
* inline helper functions
*************************/
+/* see also page_chain_add and friends in drbd_receiver.c */
+static inline struct page *page_chain_next(struct page *page)
+{
+ return (struct page *)page_private(page);
+}
+#define page_chain_for_each(page) \
+ for (; page && ({ prefetch(page_chain_next(page)); 1; }); \
+ page = page_chain_next(page))
+#define page_chain_for_each_safe(page, n) \
+ for (; page && ({ n = page_chain_next(page); 1; }); page = n)
+
+static inline int drbd_bio_has_active_page(struct bio *bio)
+{
+ struct bio_vec *bvec;
+ int i;
+
+ __bio_for_each_segment(bvec, bio, i, 0) {
+ if (page_count(bvec->bv_page) > 1)
+ return 1;
+ }
+
+ return 0;
+}
+
+static inline int drbd_ee_has_active_page(struct drbd_epoch_entry *e)
+{
+ struct page *page = e->pages;
+ page_chain_for_each(page) {
+ if (page_count(page) > 1)
+ return 1;
+ }
+ return 0;
+}
+
+
static inline void drbd_state_lock(struct drbd_conf *mdev)
{
wait_event(mdev->misc_wait,
return 0;
if (test_bit(BITMAP_IO, &mdev->flags))
return 0;
+ if (atomic_read(&mdev->new_c_uuid))
+ return 0;
return 1;
}
/* I'd like to use wait_event_lock_irq,
* but I'm not sure when it got introduced,
* and not sure when it has 3 or 4 arguments */
-static inline void inc_ap_bio(struct drbd_conf *mdev, int one_or_two)
+static inline void inc_ap_bio(struct drbd_conf *mdev, int count)
{
/* compare with after_state_ch,
* os.conn != C_WF_BITMAP_S && ns.conn == C_WF_BITMAP_S */
* to avoid races with the reconnect code,
* we need to atomic_inc within the spinlock. */
+ if (atomic_read(&mdev->new_c_uuid) && atomic_add_unless(&mdev->new_c_uuid, -1, 1))
+ drbd_queue_work_front(&mdev->data.work, &mdev->uuid_work);
+
spin_lock_irq(&mdev->req_lock);
while (!__inc_ap_bio_cond(mdev)) {
prepare_to_wait(&mdev->misc_wait, &wait, TASK_UNINTERRUPTIBLE);
finish_wait(&mdev->misc_wait, &wait);
spin_lock_irq(&mdev->req_lock);
}
- atomic_add(one_or_two, &mdev->ap_bio_cnt);
+ atomic_add(count, &mdev->ap_bio_cnt);
spin_unlock_irq(&mdev->req_lock);
}
if (test_bit(MD_NO_BARRIER, &mdev->flags))
return;
- r = blkdev_issue_flush(mdev->ldev->md_bdev, NULL);
+ r = blkdev_issue_flush(mdev->ldev->md_bdev, GFP_KERNEL, NULL,
+ BLKDEV_IFL_WAIT);
if (r) {
set_bit(MD_NO_BARRIER, &mdev->flags);
dev_err(DEV, "meta data flush failed with status %d, disabling md-flushes\n", r);
else if (ns.conn > C_CONNECTED && ns.pdsk < D_INCONSISTENT)
rv = SS_NO_REMOTE_DISK;
+ else if (ns.conn > C_CONNECTED && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE)
+ rv = SS_NO_UP_TO_DATE_DISK;
+
else if ((ns.conn == C_CONNECTED ||
ns.conn == C_WF_BITMAP_S ||
ns.conn == C_SYNC_SOURCE ||
break;
case C_WF_BITMAP_S:
case C_PAUSED_SYNC_S:
- ns.pdsk = D_OUTDATED;
+ /* remap any consistent state to D_OUTDATED,
+ * but disallow "upgrade" of not even consistent states.
+ */
+ ns.pdsk =
+ (D_DISKLESS < os.pdsk && os.pdsk < D_OUTDATED)
+ ? os.pdsk : D_OUTDATED;
break;
case C_SYNC_SOURCE:
ns.pdsk = D_INCONSISTENT;
&& (ns.pdsk < D_INCONSISTENT ||
ns.pdsk == D_UNKNOWN ||
ns.pdsk == D_OUTDATED)) {
- kfree(mdev->p_uuid);
- mdev->p_uuid = NULL;
if (get_ldev(mdev)) {
if ((ns.role == R_PRIMARY || ns.peer == R_PRIMARY) &&
- mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) {
- drbd_uuid_new_current(mdev);
- drbd_send_uuids(mdev);
- }
+ mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE &&
+ !atomic_read(&mdev->new_c_uuid))
+ atomic_set(&mdev->new_c_uuid, 2);
put_ldev(mdev);
}
}
if (ns.pdsk < D_INCONSISTENT && get_ldev(mdev)) {
- if (ns.peer == R_PRIMARY && mdev->ldev->md.uuid[UI_BITMAP] == 0)
- drbd_uuid_new_current(mdev);
+ /* Diskless peer becomes primary or got connected do diskless, primary peer. */
+ if (ns.peer == R_PRIMARY && mdev->ldev->md.uuid[UI_BITMAP] == 0 &&
+ !atomic_read(&mdev->new_c_uuid))
+ atomic_set(&mdev->new_c_uuid, 2);
/* D_DISKLESS Peer becomes secondary */
if (os.peer == R_PRIMARY && ns.peer == R_SECONDARY)
os.disk == D_ATTACHING && ns.disk == D_NEGOTIATING) {
kfree(mdev->p_uuid); /* We expect to receive up-to-date UUIDs soon. */
mdev->p_uuid = NULL; /* ...to not use the old ones in the mean time */
- drbd_send_sizes(mdev, 0); /* to start sync... */
+ drbd_send_sizes(mdev, 0, 0); /* to start sync... */
drbd_send_uuids(mdev);
drbd_send_state(mdev);
}
drbd_md_sync(mdev);
}
+static int w_new_current_uuid(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
+{
+ if (get_ldev(mdev)) {
+ if (mdev->ldev->md.uuid[UI_BITMAP] == 0) {
+ drbd_uuid_new_current(mdev);
+ if (get_net_conf(mdev)) {
+ drbd_send_uuids(mdev);
+ put_net_conf(mdev);
+ }
+ drbd_md_sync(mdev);
+ }
+ put_ldev(mdev);
+ }
+ atomic_dec(&mdev->new_c_uuid);
+ wake_up(&mdev->misc_wait);
+
+ return 1;
+}
static int drbd_thread_setup(void *arg)
{
(struct p_header *)&p, sizeof(p));
}
-int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply)
+int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags flags)
{
struct p_sizes p;
sector_t d_size, u_size;
d_size = drbd_get_max_capacity(mdev->ldev);
u_size = mdev->ldev->dc.disk_size;
q_order_type = drbd_queue_order_type(mdev);
- p.queue_order_type = cpu_to_be32(drbd_queue_order_type(mdev));
put_ldev(mdev);
} else {
d_size = 0;
p.u_size = cpu_to_be64(u_size);
p.c_size = cpu_to_be64(trigger_reply ? 0 : drbd_get_capacity(mdev->this_bdev));
p.max_segment_size = cpu_to_be32(queue_max_segment_size(mdev->rq_queue));
- p.queue_order_type = cpu_to_be32(q_order_type);
+ p.queue_order_type = cpu_to_be16(q_order_type);
+ p.dds_flags = cpu_to_be16(flags);
ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SIZES,
(struct p_header *)&p, sizeof(p));
return ok;
}
+static int drbd_send_delay_probe(struct drbd_conf *mdev, struct drbd_socket *ds)
+{
+ struct p_delay_probe dp;
+ int offset, ok = 0;
+ struct timeval now;
+
+ mutex_lock(&ds->mutex);
+ if (likely(ds->socket)) {
+ do_gettimeofday(&now);
+ offset = now.tv_usec - mdev->dps_time.tv_usec +
+ (now.tv_sec - mdev->dps_time.tv_sec) * 1000000;
+ dp.seq_num = cpu_to_be32(mdev->delay_seq);
+ dp.offset = cpu_to_be32(offset);
+
+ ok = _drbd_send_cmd(mdev, ds->socket, P_DELAY_PROBE,
+ (struct p_header *)&dp, sizeof(dp), 0);
+ }
+ mutex_unlock(&ds->mutex);
+
+ return ok;
+}
+
+static int drbd_send_delay_probes(struct drbd_conf *mdev)
+{
+ int ok;
+
+ mdev->delay_seq++;
+ do_gettimeofday(&mdev->dps_time);
+ ok = drbd_send_delay_probe(mdev, &mdev->meta);
+ ok = ok && drbd_send_delay_probe(mdev, &mdev->data);
+
+ mdev->dp_volume_last = mdev->send_cnt;
+ mod_timer(&mdev->delay_probe_timer, jiffies + mdev->sync_conf.dp_interval * HZ / 10);
+
+ return ok;
+}
+
/* called on sndtimeo
* returns FALSE if we should retry,
* TRUE if we think connection is dead
return 1;
}
+static int _drbd_send_zc_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e)
+{
+ struct page *page = e->pages;
+ unsigned len = e->size;
+ page_chain_for_each(page) {
+ unsigned l = min_t(unsigned, len, PAGE_SIZE);
+ if (!_drbd_send_page(mdev, page, 0, l))
+ return 0;
+ len -= l;
+ }
+ return 1;
+}
+
+static void consider_delay_probes(struct drbd_conf *mdev)
+{
+ if (mdev->state.conn != C_SYNC_SOURCE || mdev->agreed_pro_version < 93)
+ return;
+
+ if (mdev->dp_volume_last + mdev->sync_conf.dp_volume * 2 < mdev->send_cnt)
+ drbd_send_delay_probes(mdev);
+}
+
+static int w_delay_probes(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
+{
+ if (!cancel && mdev->state.conn == C_SYNC_SOURCE)
+ drbd_send_delay_probes(mdev);
+
+ return 1;
+}
+
+static void delay_probe_timer_fn(unsigned long data)
+{
+ struct drbd_conf *mdev = (struct drbd_conf *) data;
+
+ if (list_empty(&mdev->delay_probe_work.list))
+ drbd_queue_work(&mdev->data.work, &mdev->delay_probe_work);
+}
+
/* Used to send write requests
* R_PRIMARY -> Peer (P_DATA)
*/
drbd_send(mdev, mdev->data.socket, &p, sizeof(p), MSG_MORE));
if (ok && dgs) {
dgb = mdev->int_dig_out;
- drbd_csum(mdev, mdev->integrity_w_tfm, req->master_bio, dgb);
+ drbd_csum_bio(mdev, mdev->integrity_w_tfm, req->master_bio, dgb);
ok = drbd_send(mdev, mdev->data.socket, dgb, dgs, MSG_MORE);
}
if (ok) {
}
drbd_put_data_sock(mdev);
+
+ if (ok)
+ consider_delay_probes(mdev);
+
return ok;
}
sizeof(p), MSG_MORE);
if (ok && dgs) {
dgb = mdev->int_dig_out;
- drbd_csum(mdev, mdev->integrity_w_tfm, e->private_bio, dgb);
+ drbd_csum_ee(mdev, mdev->integrity_w_tfm, e, dgb);
ok = drbd_send(mdev, mdev->data.socket, dgb, dgs, MSG_MORE);
}
if (ok)
- ok = _drbd_send_zc_bio(mdev, e->private_bio);
+ ok = _drbd_send_zc_ee(mdev, e);
drbd_put_data_sock(mdev);
+
+ if (ok)
+ consider_delay_probes(mdev);
+
return ok;
}
atomic_set(&mdev->net_cnt, 0);
atomic_set(&mdev->packet_seq, 0);
atomic_set(&mdev->pp_in_use, 0);
+ atomic_set(&mdev->new_c_uuid, 0);
mutex_init(&mdev->md_io_mutex);
mutex_init(&mdev->data.mutex);
INIT_LIST_HEAD(&mdev->unplug_work.list);
INIT_LIST_HEAD(&mdev->md_sync_work.list);
INIT_LIST_HEAD(&mdev->bm_io_work.w.list);
+ INIT_LIST_HEAD(&mdev->delay_probes);
+ INIT_LIST_HEAD(&mdev->delay_probe_work.list);
+ INIT_LIST_HEAD(&mdev->uuid_work.list);
+
mdev->resync_work.cb = w_resync_inactive;
mdev->unplug_work.cb = w_send_write_hint;
mdev->md_sync_work.cb = w_md_sync;
mdev->bm_io_work.w.cb = w_bitmap_io;
+ mdev->delay_probe_work.cb = w_delay_probes;
+ mdev->uuid_work.cb = w_new_current_uuid;
init_timer(&mdev->resync_timer);
init_timer(&mdev->md_sync_timer);
+ init_timer(&mdev->delay_probe_timer);
mdev->resync_timer.function = resync_timer_fn;
mdev->resync_timer.data = (unsigned long) mdev;
mdev->md_sync_timer.function = md_sync_timer_fn;
mdev->md_sync_timer.data = (unsigned long) mdev;
+ mdev->delay_probe_timer.function = delay_probe_timer_fn;
+ mdev->delay_probe_timer.data = (unsigned long) mdev;
+
init_waitqueue_head(&mdev->misc_wait);
init_waitqueue_head(&mdev->state_wait);
drbd_set_my_capacity(mdev, 0);
if (mdev->bitmap) {
/* maybe never allocated. */
- drbd_bm_resize(mdev, 0);
+ drbd_bm_resize(mdev, 0, 1);
drbd_bm_cleanup(mdev);
}
if (err)
goto Enomem;
- drbd_proc = proc_create("drbd", S_IFREG | S_IRUGO , NULL, &drbd_proc_fops);
+ drbd_proc = proc_create_data("drbd", S_IFREG | S_IRUGO , NULL, &drbd_proc_fops, NULL);
if (!drbd_proc) {
printk(KERN_ERR "drbd: unable to register proc file\n");
goto Enomem;
[DRBD_FAULT_DT_RD] = "Data read",
[DRBD_FAULT_DT_RA] = "Data read ahead",
[DRBD_FAULT_BM_ALLOC] = "BM allocation",
- [DRBD_FAULT_AL_EE] = "EE allocation"
+ [DRBD_FAULT_AL_EE] = "EE allocation",
+ [DRBD_FAULT_RECEIVE] = "receive data corruption",
};
return (type < DRBD_FAULT_MAX) ? _faults[type] : "**Unknown**";
* Returns 0 on success, negative return values indicate errors.
* You should call drbd_md_sync() after calling this function.
*/
-enum determine_dev_size drbd_determin_dev_size(struct drbd_conf *mdev, int force) __must_hold(local)
+enum determine_dev_size drbd_determin_dev_size(struct drbd_conf *mdev, enum dds_flags flags) __must_hold(local)
{
sector_t prev_first_sect, prev_size; /* previous meta location */
sector_t la_size;
/* TODO: should only be some assert here, not (re)init... */
drbd_md_set_sector_offsets(mdev, mdev->ldev);
- size = drbd_new_dev_size(mdev, mdev->ldev, force);
+ size = drbd_new_dev_size(mdev, mdev->ldev, flags & DDSF_FORCED);
if (drbd_get_capacity(mdev->this_bdev) != size ||
drbd_bm_capacity(mdev) != size) {
int err;
- err = drbd_bm_resize(mdev, size);
+ err = drbd_bm_resize(mdev, size, !(flags & DDSF_NO_RESYNC));
if (unlikely(err)) {
/* currently there is only one error: ENOMEM! */
size = drbd_bm_capacity(mdev)>>1;
struct request_queue * const b = mdev->ldev->backing_bdev->bd_disk->queue;
int max_segments = mdev->ldev->dc.max_bio_bvecs;
- if (b->merge_bvec_fn && !mdev->ldev->dc.use_bmbv)
- max_seg_s = PAGE_SIZE;
-
max_seg_s = min(queue_max_sectors(b) * queue_logical_block_size(b), max_seg_s);
blk_queue_max_hw_sectors(q, max_seg_s >> 9);
}
/* allocation not in the IO path, cqueue thread context */
- new_conf = kmalloc(sizeof(struct net_conf), GFP_KERNEL);
+ new_conf = kzalloc(sizeof(struct net_conf), GFP_KERNEL);
if (!new_conf) {
retcode = ERR_NOMEM;
goto fail;
}
- memset(new_conf, 0, sizeof(struct net_conf));
new_conf->timeout = DRBD_TIMEOUT_DEF;
new_conf->try_connect_int = DRBD_CONNECT_INT_DEF;
new_conf->ping_int = DRBD_PING_INT_DEF;
{
struct resize rs;
int retcode = NO_ERROR;
- int ldsc = 0; /* local disk size changed */
enum determine_dev_size dd;
+ enum dds_flags ddsf;
memset(&rs, 0, sizeof(struct resize));
if (!resize_from_tags(mdev, nlp->tag_list, &rs)) {
goto fail;
}
- if (mdev->ldev->known_size != drbd_get_capacity(mdev->ldev->backing_bdev)) {
- mdev->ldev->known_size = drbd_get_capacity(mdev->ldev->backing_bdev);
- ldsc = 1;
+ if (rs.no_resync && mdev->agreed_pro_version < 93) {
+ retcode = ERR_NEED_APV_93;
+ goto fail;
}
+ if (mdev->ldev->known_size != drbd_get_capacity(mdev->ldev->backing_bdev))
+ mdev->ldev->known_size = drbd_get_capacity(mdev->ldev->backing_bdev);
+
mdev->ldev->dc.disk_size = (sector_t)rs.resize_size;
- dd = drbd_determin_dev_size(mdev, rs.resize_force);
+ ddsf = (rs.resize_force ? DDSF_FORCED : 0) | (rs.no_resync ? DDSF_NO_RESYNC : 0);
+ dd = drbd_determin_dev_size(mdev, ddsf);
drbd_md_sync(mdev);
put_ldev(mdev);
if (dd == dev_size_error) {
goto fail;
}
- if (mdev->state.conn == C_CONNECTED && (dd != unchanged || ldsc)) {
+ if (mdev->state.conn == C_CONNECTED) {
if (dd == grew)
set_bit(RESIZE_PENDING, &mdev->flags);
drbd_send_uuids(mdev);
- drbd_send_sizes(mdev, 1);
+ drbd_send_sizes(mdev, 1, ddsf);
}
fail:
sc.rate = DRBD_RATE_DEF;
sc.after = DRBD_AFTER_DEF;
sc.al_extents = DRBD_AL_EXTENTS_DEF;
+ sc.dp_volume = DRBD_DP_VOLUME_DEF;
+ sc.dp_interval = DRBD_DP_INTERVAL_DEF;
+ sc.throttle_th = DRBD_RS_THROTTLE_TH_DEF;
+ sc.hold_off_th = DRBD_RS_HOLD_OFF_TH_DEF;
} else
memcpy(&sc, &mdev->sync_conf, sizeof(struct syncer_conf));
{
struct cn_msg *cn_reply;
struct drbd_nl_cfg_reply *reply;
- struct bio_vec *bvec;
unsigned short *tl;
- int i;
+ struct page *page;
+ unsigned len;
if (!e)
return;
put_unaligned(T_ee_data, tl++);
put_unaligned(e->size, tl++);
- __bio_for_each_segment(bvec, e->private_bio, i, 0) {
- void *d = kmap(bvec->bv_page);
- memcpy(tl, d + bvec->bv_offset, bvec->bv_len);
- kunmap(bvec->bv_page);
- tl=(unsigned short*)((char*)tl + bvec->bv_len);
+ len = e->size;
+ page = e->pages;
+ page_chain_for_each(page) {
+ void *d = kmap_atomic(page, KM_USER0);
+ unsigned l = min_t(unsigned, len, PAGE_SIZE);
+ memcpy(tl, d, l);
+ kunmap_atomic(d, KM_USER0);
+ tl = (unsigned short*)((char*)tl + l);
+ len -= l;
}
put_unaligned(TT_END, tl++); /* Close the tag list */
seq_printf(seq, "sync'ed:%3u.%u%% ", res / 10, res % 10);
/* if more than 1 GB display in MB */
if (mdev->rs_total > 0x100000L)
- seq_printf(seq, "(%lu/%lu)M\n\t",
+ seq_printf(seq, "(%lu/%lu)M",
(unsigned long) Bit2KB(rs_left >> 10),
(unsigned long) Bit2KB(mdev->rs_total >> 10));
else
- seq_printf(seq, "(%lu/%lu)K\n\t",
+ seq_printf(seq, "(%lu/%lu)K",
(unsigned long) Bit2KB(rs_left),
(unsigned long) Bit2KB(mdev->rs_total));
+ if (mdev->state.conn == C_SYNC_TARGET)
+ seq_printf(seq, " queue_delay: %d.%d ms\n\t",
+ mdev->data_delay / 1000,
+ (mdev->data_delay % 1000) / 100);
+ else if (mdev->state.conn == C_SYNC_SOURCE)
+ seq_printf(seq, " delay_probe: %u\n\t", mdev->delay_seq);
+
/* see drivers/md/md.c
* We do not want to overflow, so the order of operands and
* the * 100 / 100 trick are important. We do a +1 to be
else
seq_printf(seq, " (%ld)", dbdt);
+ if (mdev->state.conn == C_SYNC_TARGET) {
+ if (mdev->c_sync_rate > 1000)
+ seq_printf(seq, " want: %d,%03d",
+ mdev->c_sync_rate / 1000, mdev->c_sync_rate % 1000);
+ else
+ seq_printf(seq, " want: %d", mdev->c_sync_rate);
+ }
+
seq_printf(seq, " K/sec\n");
}
#define GFP_TRY (__GFP_HIGHMEM | __GFP_NOWARN)
-static struct page *drbd_pp_first_page_or_try_alloc(struct drbd_conf *mdev)
+/*
+ * some helper functions to deal with single linked page lists,
+ * page->private being our "next" pointer.
+ */
+
+/* If at least n pages are linked at head, get n pages off.
+ * Otherwise, don't modify head, and return NULL.
+ * Locking is the responsibility of the caller.
+ */
+static struct page *page_chain_del(struct page **head, int n)
+{
+ struct page *page;
+ struct page *tmp;
+
+ BUG_ON(!n);
+ BUG_ON(!head);
+
+ page = *head;
+
+ if (!page)
+ return NULL;
+
+ while (page) {
+ tmp = page_chain_next(page);
+ if (--n == 0)
+ break; /* found sufficient pages */
+ if (tmp == NULL)
+ /* insufficient pages, don't use any of them. */
+ return NULL;
+ page = tmp;
+ }
+
+ /* add end of list marker for the returned list */
+ set_page_private(page, 0);
+ /* actual return value, and adjustment of head */
+ page = *head;
+ *head = tmp;
+ return page;
+}
+
+/* may be used outside of locks to find the tail of a (usually short)
+ * "private" page chain, before adding it back to a global chain head
+ * with page_chain_add() under a spinlock. */
+static struct page *page_chain_tail(struct page *page, int *len)
+{
+ struct page *tmp;
+ int i = 1;
+ while ((tmp = page_chain_next(page)))
+ ++i, page = tmp;
+ if (len)
+ *len = i;
+ return page;
+}
+
+static int page_chain_free(struct page *page)
+{
+ struct page *tmp;
+ int i = 0;
+ page_chain_for_each_safe(page, tmp) {
+ put_page(page);
+ ++i;
+ }
+ return i;
+}
+
+static void page_chain_add(struct page **head,
+ struct page *chain_first, struct page *chain_last)
+{
+#if 1
+ struct page *tmp;
+ tmp = page_chain_tail(chain_first, NULL);
+ BUG_ON(tmp != chain_last);
+#endif
+
+ /* add chain to head */
+ set_page_private(chain_last, (unsigned long)*head);
+ *head = chain_first;
+}
+
+static struct page *drbd_pp_first_pages_or_try_alloc(struct drbd_conf *mdev, int number)
{
struct page *page = NULL;
+ struct page *tmp = NULL;
+ int i = 0;
/* Yes, testing drbd_pp_vacant outside the lock is racy.
* So what. It saves a spin_lock. */
- if (drbd_pp_vacant > 0) {
+ if (drbd_pp_vacant >= number) {
spin_lock(&drbd_pp_lock);
- page = drbd_pp_pool;
- if (page) {
- drbd_pp_pool = (struct page *)page_private(page);
- set_page_private(page, 0); /* just to be polite */
- drbd_pp_vacant--;
- }
+ page = page_chain_del(&drbd_pp_pool, number);
+ if (page)
+ drbd_pp_vacant -= number;
spin_unlock(&drbd_pp_lock);
+ if (page)
+ return page;
}
+
/* GFP_TRY, because we must not cause arbitrary write-out: in a DRBD
* "criss-cross" setup, that might cause write-out on some other DRBD,
* which in turn might block on the other node at this very place. */
- if (!page)
- page = alloc_page(GFP_TRY);
- if (page)
- atomic_inc(&mdev->pp_in_use);
- return page;
+ for (i = 0; i < number; i++) {
+ tmp = alloc_page(GFP_TRY);
+ if (!tmp)
+ break;
+ set_page_private(tmp, (unsigned long)page);
+ page = tmp;
+ }
+
+ if (i == number)
+ return page;
+
+ /* Not enough pages immediately available this time.
+ * No need to jump around here, drbd_pp_alloc will retry this
+ * function "soon". */
+ if (page) {
+ tmp = page_chain_tail(page, NULL);
+ spin_lock(&drbd_pp_lock);
+ page_chain_add(&drbd_pp_pool, page, tmp);
+ drbd_pp_vacant += i;
+ spin_unlock(&drbd_pp_lock);
+ }
+ return NULL;
}
/* kick lower level device, if we have more than (arbitrary number)
list_for_each_safe(le, tle, &mdev->net_ee) {
e = list_entry(le, struct drbd_epoch_entry, w.list);
- if (drbd_bio_has_active_page(e->private_bio))
+ if (drbd_ee_has_active_page(e))
break;
list_move(le, to_be_freed);
}
}
/**
- * drbd_pp_alloc() - Returns a page, fails only if a signal comes in
+ * drbd_pp_alloc() - Returns @number pages, retries forever (or until signalled)
* @mdev: DRBD device.
- * @retry: whether or not to retry allocation forever (or until signalled)
+ * @number: number of pages requested
+ * @retry: whether to retry, if not enough pages are available right now
+ *
+ * Tries to allocate number pages, first from our own page pool, then from
+ * the kernel, unless this allocation would exceed the max_buffers setting.
+ * Possibly retry until DRBD frees sufficient pages somewhere else.
*
- * Tries to allocate a page, first from our own page pool, then from the
- * kernel, unless this allocation would exceed the max_buffers setting.
- * If @retry is non-zero, retry until DRBD frees a page somewhere else.
+ * Returns a page chain linked via page->private.
*/
-static struct page *drbd_pp_alloc(struct drbd_conf *mdev, int retry)
+static struct page *drbd_pp_alloc(struct drbd_conf *mdev, unsigned number, bool retry)
{
struct page *page = NULL;
DEFINE_WAIT(wait);
- if (atomic_read(&mdev->pp_in_use) < mdev->net_conf->max_buffers) {
- page = drbd_pp_first_page_or_try_alloc(mdev);
- if (page)
- return page;
- }
+ /* Yes, we may run up to @number over max_buffers. If we
+ * follow it strictly, the admin will get it wrong anyways. */
+ if (atomic_read(&mdev->pp_in_use) < mdev->net_conf->max_buffers)
+ page = drbd_pp_first_pages_or_try_alloc(mdev, number);
- for (;;) {
+ while (page == NULL) {
prepare_to_wait(&drbd_pp_wait, &wait, TASK_INTERRUPTIBLE);
drbd_kick_lo_and_reclaim_net(mdev);
if (atomic_read(&mdev->pp_in_use) < mdev->net_conf->max_buffers) {
- page = drbd_pp_first_page_or_try_alloc(mdev);
+ page = drbd_pp_first_pages_or_try_alloc(mdev, number);
if (page)
break;
}
}
finish_wait(&drbd_pp_wait, &wait);
+ if (page)
+ atomic_add(number, &mdev->pp_in_use);
return page;
}
/* Must not be used from irq, as that may deadlock: see drbd_pp_alloc.
- * Is also used from inside an other spin_lock_irq(&mdev->req_lock) */
+ * Is also used from inside an other spin_lock_irq(&mdev->req_lock);
+ * Either links the page chain back to the global pool,
+ * or returns all pages to the system. */
static void drbd_pp_free(struct drbd_conf *mdev, struct page *page)
{
- int free_it;
-
- spin_lock(&drbd_pp_lock);
- if (drbd_pp_vacant > (DRBD_MAX_SEGMENT_SIZE/PAGE_SIZE)*minor_count) {
- free_it = 1;
- } else {
- set_page_private(page, (unsigned long)drbd_pp_pool);
- drbd_pp_pool = page;
- drbd_pp_vacant++;
- free_it = 0;
- }
- spin_unlock(&drbd_pp_lock);
-
- atomic_dec(&mdev->pp_in_use);
-
- if (free_it)
- __free_page(page);
-
- wake_up(&drbd_pp_wait);
-}
-
-static void drbd_pp_free_bio_pages(struct drbd_conf *mdev, struct bio *bio)
-{
- struct page *p_to_be_freed = NULL;
- struct page *page;
- struct bio_vec *bvec;
int i;
-
- spin_lock(&drbd_pp_lock);
- __bio_for_each_segment(bvec, bio, i, 0) {
- if (drbd_pp_vacant > (DRBD_MAX_SEGMENT_SIZE/PAGE_SIZE)*minor_count) {
- set_page_private(bvec->bv_page, (unsigned long)p_to_be_freed);
- p_to_be_freed = bvec->bv_page;
- } else {
- set_page_private(bvec->bv_page, (unsigned long)drbd_pp_pool);
- drbd_pp_pool = bvec->bv_page;
- drbd_pp_vacant++;
- }
- }
- spin_unlock(&drbd_pp_lock);
- atomic_sub(bio->bi_vcnt, &mdev->pp_in_use);
-
- while (p_to_be_freed) {
- page = p_to_be_freed;
- p_to_be_freed = (struct page *)page_private(page);
- set_page_private(page, 0); /* just to be polite */
- put_page(page);
+ if (drbd_pp_vacant > (DRBD_MAX_SEGMENT_SIZE/PAGE_SIZE)*minor_count)
+ i = page_chain_free(page);
+ else {
+ struct page *tmp;
+ tmp = page_chain_tail(page, &i);
+ spin_lock(&drbd_pp_lock);
+ page_chain_add(&drbd_pp_pool, page, tmp);
+ drbd_pp_vacant += i;
+ spin_unlock(&drbd_pp_lock);
}
-
+ atomic_sub(i, &mdev->pp_in_use);
+ i = atomic_read(&mdev->pp_in_use);
+ if (i < 0)
+ dev_warn(DEV, "ASSERTION FAILED: pp_in_use: %d < 0\n", i);
wake_up(&drbd_pp_wait);
}
unsigned int data_size,
gfp_t gfp_mask) __must_hold(local)
{
- struct request_queue *q;
struct drbd_epoch_entry *e;
struct page *page;
- struct bio *bio;
- unsigned int ds;
+ unsigned nr_pages = (data_size + PAGE_SIZE -1) >> PAGE_SHIFT;
if (FAULT_ACTIVE(mdev, DRBD_FAULT_AL_EE))
return NULL;
return NULL;
}
- bio = bio_alloc(gfp_mask & ~__GFP_HIGHMEM, div_ceil(data_size, PAGE_SIZE));
- if (!bio) {
- if (!(gfp_mask & __GFP_NOWARN))
- dev_err(DEV, "alloc_ee: Allocation of a bio failed\n");
- goto fail1;
- }
-
- bio->bi_bdev = mdev->ldev->backing_bdev;
- bio->bi_sector = sector;
-
- ds = data_size;
- while (ds) {
- page = drbd_pp_alloc(mdev, (gfp_mask & __GFP_WAIT));
- if (!page) {
- if (!(gfp_mask & __GFP_NOWARN))
- dev_err(DEV, "alloc_ee: Allocation of a page failed\n");
- goto fail2;
- }
- if (!bio_add_page(bio, page, min_t(int, ds, PAGE_SIZE), 0)) {
- drbd_pp_free(mdev, page);
- dev_err(DEV, "alloc_ee: bio_add_page(s=%llu,"
- "data_size=%u,ds=%u) failed\n",
- (unsigned long long)sector, data_size, ds);
-
- q = bdev_get_queue(bio->bi_bdev);
- if (q->merge_bvec_fn) {
- struct bvec_merge_data bvm = {
- .bi_bdev = bio->bi_bdev,
- .bi_sector = bio->bi_sector,
- .bi_size = bio->bi_size,
- .bi_rw = bio->bi_rw,
- };
- int l = q->merge_bvec_fn(q, &bvm,
- &bio->bi_io_vec[bio->bi_vcnt]);
- dev_err(DEV, "merge_bvec_fn() = %d\n", l);
- }
-
- /* dump more of the bio. */
- dev_err(DEV, "bio->bi_max_vecs = %d\n", bio->bi_max_vecs);
- dev_err(DEV, "bio->bi_vcnt = %d\n", bio->bi_vcnt);
- dev_err(DEV, "bio->bi_size = %d\n", bio->bi_size);
- dev_err(DEV, "bio->bi_phys_segments = %d\n", bio->bi_phys_segments);
-
- goto fail2;
- break;
- }
- ds -= min_t(int, ds, PAGE_SIZE);
- }
-
- D_ASSERT(data_size == bio->bi_size);
-
- bio->bi_private = e;
- e->mdev = mdev;
- e->sector = sector;
- e->size = bio->bi_size;
+ page = drbd_pp_alloc(mdev, nr_pages, (gfp_mask & __GFP_WAIT));
+ if (!page)
+ goto fail;
- e->private_bio = bio;
- e->block_id = id;
INIT_HLIST_NODE(&e->colision);
e->epoch = NULL;
+ e->mdev = mdev;
+ e->pages = page;
+ atomic_set(&e->pending_bios, 0);
+ e->size = data_size;
e->flags = 0;
+ e->sector = sector;
+ e->sector = sector;
+ e->block_id = id;
return e;
- fail2:
- drbd_pp_free_bio_pages(mdev, bio);
- bio_put(bio);
- fail1:
+ fail:
mempool_free(e, drbd_ee_mempool);
-
return NULL;
}
void drbd_free_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e)
{
- struct bio *bio = e->private_bio;
- drbd_pp_free_bio_pages(mdev, bio);
- bio_put(bio);
+ drbd_pp_free(mdev, e->pages);
+ D_ASSERT(atomic_read(&e->pending_bios) == 0);
D_ASSERT(hlist_unhashed(&e->colision));
mempool_free(e, drbd_ee_mempool);
}
if (!drbd_send_protocol(mdev))
return -1;
drbd_send_sync_param(mdev, &mdev->sync_conf);
- drbd_send_sizes(mdev, 0);
+ drbd_send_sizes(mdev, 0, 0);
drbd_send_uuids(mdev);
drbd_send_state(mdev);
clear_bit(USE_DEGR_WFC_T, &mdev->flags);
int rv;
if (mdev->write_ordering >= WO_bdev_flush && get_ldev(mdev)) {
- rv = blkdev_issue_flush(mdev->ldev->backing_bdev, NULL);
+ rv = blkdev_issue_flush(mdev->ldev->backing_bdev, GFP_KERNEL,
+ NULL, BLKDEV_IFL_WAIT);
if (rv) {
dev_err(DEV, "local disk flush failed with status %d\n", rv);
/* would rather check on EOPNOTSUPP, but that is not reliable.
dev_info(DEV, "Method to ensure write ordering: %s\n", write_ordering_str[mdev->write_ordering]);
}
+/**
+ * drbd_submit_ee()
+ * @mdev: DRBD device.
+ * @e: epoch entry
+ * @rw: flag field, see bio->bi_rw
+ */
+/* TODO allocate from our own bio_set. */
+int drbd_submit_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e,
+ const unsigned rw, const int fault_type)
+{
+ struct bio *bios = NULL;
+ struct bio *bio;
+ struct page *page = e->pages;
+ sector_t sector = e->sector;
+ unsigned ds = e->size;
+ unsigned n_bios = 0;
+ unsigned nr_pages = (ds + PAGE_SIZE -1) >> PAGE_SHIFT;
+
+ if (atomic_read(&mdev->new_c_uuid)) {
+ if (atomic_add_unless(&mdev->new_c_uuid, -1, 1)) {
+ drbd_uuid_new_current(mdev);
+ drbd_md_sync(mdev);
+
+ atomic_dec(&mdev->new_c_uuid);
+ wake_up(&mdev->misc_wait);
+ }
+ wait_event(mdev->misc_wait, !atomic_read(&mdev->new_c_uuid));
+ }
+
+ /* In most cases, we will only need one bio. But in case the lower
+ * level restrictions happen to be different at this offset on this
+ * side than those of the sending peer, we may need to submit the
+ * request in more than one bio. */
+next_bio:
+ bio = bio_alloc(GFP_NOIO, nr_pages);
+ if (!bio) {
+ dev_err(DEV, "submit_ee: Allocation of a bio failed\n");
+ goto fail;
+ }
+ /* > e->sector, unless this is the first bio */
+ bio->bi_sector = sector;
+ bio->bi_bdev = mdev->ldev->backing_bdev;
+ /* we special case some flags in the multi-bio case, see below
+ * (BIO_RW_UNPLUG, BIO_RW_BARRIER) */
+ bio->bi_rw = rw;
+ bio->bi_private = e;
+ bio->bi_end_io = drbd_endio_sec;
+
+ bio->bi_next = bios;
+ bios = bio;
+ ++n_bios;
+
+ page_chain_for_each(page) {
+ unsigned len = min_t(unsigned, ds, PAGE_SIZE);
+ if (!bio_add_page(bio, page, len, 0)) {
+ /* a single page must always be possible! */
+ BUG_ON(bio->bi_vcnt == 0);
+ goto next_bio;
+ }
+ ds -= len;
+ sector += len >> 9;
+ --nr_pages;
+ }
+ D_ASSERT(page == NULL);
+ D_ASSERT(ds == 0);
+
+ atomic_set(&e->pending_bios, n_bios);
+ do {
+ bio = bios;
+ bios = bios->bi_next;
+ bio->bi_next = NULL;
+
+ /* strip off BIO_RW_UNPLUG unless it is the last bio */
+ if (bios)
+ bio->bi_rw &= ~(1<<BIO_RW_UNPLUG);
+
+ drbd_generic_make_request(mdev, fault_type, bio);
+
+ /* strip off BIO_RW_BARRIER,
+ * unless it is the first or last bio */
+ if (bios && bios->bi_next)
+ bios->bi_rw &= ~(1<<BIO_RW_BARRIER);
+ } while (bios);
+ maybe_kick_lo(mdev);
+ return 0;
+
+fail:
+ while (bios) {
+ bio = bios;
+ bios = bios->bi_next;
+ bio_put(bio);
+ }
+ return -ENOMEM;
+}
+
/**
* w_e_reissue() - Worker callback; Resubmit a bio, without BIO_RW_BARRIER set
* @mdev: DRBD device.
int w_e_reissue(struct drbd_conf *mdev, struct drbd_work *w, int cancel) __releases(local)
{
struct drbd_epoch_entry *e = (struct drbd_epoch_entry *)w;
- struct bio *bio = e->private_bio;
-
/* We leave DE_CONTAINS_A_BARRIER and EE_IS_BARRIER in place,
(and DE_BARRIER_IN_NEXT_EPOCH_ISSUED in the previous Epoch)
so that we can finish that epoch in drbd_may_finish_epoch().
if (previous_epoch(mdev, e->epoch))
dev_warn(DEV, "Write ordering was not enforced (one time event)\n");
- /* prepare bio for re-submit,
- * re-init volatile members */
/* we still have a local reference,
* get_ldev was done in receive_Data. */
- bio->bi_bdev = mdev->ldev->backing_bdev;
- bio->bi_sector = e->sector;
- bio->bi_size = e->size;
- bio->bi_idx = 0;
-
- bio->bi_flags &= ~(BIO_POOL_MASK - 1);
- bio->bi_flags |= 1 << BIO_UPTODATE;
-
- /* don't know whether this is necessary: */
- bio->bi_phys_segments = 0;
- bio->bi_next = NULL;
-
- /* these should be unchanged: */
- /* bio->bi_end_io = drbd_endio_write_sec; */
- /* bio->bi_vcnt = whatever; */
e->w.cb = e_end_block;
-
- /* This is no longer a barrier request. */
- bio->bi_rw &= ~(1UL << BIO_RW_BARRIER);
-
- drbd_generic_make_request(mdev, DRBD_FAULT_DT_WR, bio);
-
+ if (drbd_submit_ee(mdev, e, WRITE, DRBD_FAULT_DT_WR) != 0) {
+ /* drbd_submit_ee fails for one reason only:
+ * if was not able to allocate sufficient bios.
+ * requeue, try again later. */
+ e->w.cb = w_e_reissue;
+ drbd_queue_work(&mdev->data.work, &e->w);
+ }
return 1;
}
static struct drbd_epoch_entry *
read_in_block(struct drbd_conf *mdev, u64 id, sector_t sector, int data_size) __must_hold(local)
{
+ const sector_t capacity = drbd_get_capacity(mdev->this_bdev);
struct drbd_epoch_entry *e;
- struct bio_vec *bvec;
struct page *page;
- struct bio *bio;
- int dgs, ds, i, rr;
+ int dgs, ds, rr;
void *dig_in = mdev->int_dig_in;
void *dig_vv = mdev->int_dig_vv;
+ unsigned long *data;
dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_r_tfm) ?
crypto_hash_digestsize(mdev->integrity_r_tfm) : 0;
ERR_IF(data_size & 0x1ff) return NULL;
ERR_IF(data_size > DRBD_MAX_SEGMENT_SIZE) return NULL;
+ /* even though we trust out peer,
+ * we sometimes have to double check. */
+ if (sector + (data_size>>9) > capacity) {
+ dev_err(DEV, "capacity: %llus < sector: %llus + size: %u\n",
+ (unsigned long long)capacity,
+ (unsigned long long)sector, data_size);
+ return NULL;
+ }
+
/* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD
* "criss-cross" setup, that might cause write-out on some other DRBD,
* which in turn might block on the other node at this very place. */
e = drbd_alloc_ee(mdev, id, sector, data_size, GFP_NOIO);
if (!e)
return NULL;
- bio = e->private_bio;
+
ds = data_size;
- bio_for_each_segment(bvec, bio, i) {
- page = bvec->bv_page;
- rr = drbd_recv(mdev, kmap(page), min_t(int, ds, PAGE_SIZE));
+ page = e->pages;
+ page_chain_for_each(page) {
+ unsigned len = min_t(int, ds, PAGE_SIZE);
+ data = kmap(page);
+ rr = drbd_recv(mdev, data, len);
+ if (FAULT_ACTIVE(mdev, DRBD_FAULT_RECEIVE)) {
+ dev_err(DEV, "Fault injection: Corrupting data on receive\n");
+ data[0] = data[0] ^ (unsigned long)-1;
+ }
kunmap(page);
- if (rr != min_t(int, ds, PAGE_SIZE)) {
+ if (rr != len) {
drbd_free_ee(mdev, e);
dev_warn(DEV, "short read receiving data: read %d expected %d\n",
- rr, min_t(int, ds, PAGE_SIZE));
+ rr, len);
return NULL;
}
ds -= rr;
}
if (dgs) {
- drbd_csum(mdev, mdev->integrity_r_tfm, bio, dig_vv);
+ drbd_csum_ee(mdev, mdev->integrity_r_tfm, e, dig_vv);
if (memcmp(dig_in, dig_vv, dgs)) {
dev_err(DEV, "Digest integrity check FAILED.\n");
drbd_bcast_ee(mdev, "digest failed",
int rr, rv = 1;
void *data;
- page = drbd_pp_alloc(mdev, 1);
+ if (!data_size)
+ return TRUE;
+
+ page = drbd_pp_alloc(mdev, 1, 1);
data = kmap(page);
while (data_size) {
}
if (dgs) {
- drbd_csum(mdev, mdev->integrity_r_tfm, bio, dig_vv);
+ drbd_csum_bio(mdev, mdev->integrity_r_tfm, bio, dig_vv);
if (memcmp(dig_in, dig_vv, dgs)) {
dev_err(DEV, "Digest integrity check FAILED. Broken NICs?\n");
return 0;
D_ASSERT(hlist_unhashed(&e->colision));
- if (likely(drbd_bio_uptodate(e->private_bio))) {
+ if (likely((e->flags & EE_WAS_ERROR) == 0)) {
drbd_set_in_sync(mdev, sector, e->size);
ok = drbd_send_ack(mdev, P_RS_WRITE_ACK, e);
} else {
struct drbd_epoch_entry *e;
e = read_in_block(mdev, ID_SYNCER, sector, data_size);
- if (!e) {
- put_ldev(mdev);
- return FALSE;
- }
+ if (!e)
+ goto fail;
dec_rs_pending(mdev);
- e->private_bio->bi_end_io = drbd_endio_write_sec;
- e->private_bio->bi_rw = WRITE;
- e->w.cb = e_end_resync_block;
-
inc_unacked(mdev);
/* corresponding dec_unacked() in e_end_resync_block()
* respective _drbd_clear_done_ee */
+ e->w.cb = e_end_resync_block;
+
spin_lock_irq(&mdev->req_lock);
list_add(&e->w.list, &mdev->sync_ee);
spin_unlock_irq(&mdev->req_lock);
- drbd_generic_make_request(mdev, DRBD_FAULT_RS_WR, e->private_bio);
- /* accounting done in endio */
+ if (drbd_submit_ee(mdev, e, WRITE, DRBD_FAULT_RS_WR) == 0)
+ return TRUE;
- maybe_kick_lo(mdev);
- return TRUE;
+ drbd_free_ee(mdev, e);
+fail:
+ put_ldev(mdev);
+ return FALSE;
}
static int receive_DataReply(struct drbd_conf *mdev, struct p_header *h)
}
if (mdev->net_conf->wire_protocol == DRBD_PROT_C) {
- if (likely(drbd_bio_uptodate(e->private_bio))) {
+ if (likely((e->flags & EE_WAS_ERROR) == 0)) {
pcmd = (mdev->state.conn >= C_SYNC_SOURCE &&
mdev->state.conn <= C_PAUSED_SYNC_T &&
e->flags & EE_MAY_SET_IN_SYNC) ?
return FALSE;
}
- e->private_bio->bi_end_io = drbd_endio_write_sec;
e->w.cb = e_end_block;
spin_lock(&mdev->epoch_lock);
drbd_al_begin_io(mdev, e->sector);
}
- e->private_bio->bi_rw = rw;
- drbd_generic_make_request(mdev, DRBD_FAULT_DT_WR, e->private_bio);
- /* accounting done in endio */
-
- maybe_kick_lo(mdev);
- return TRUE;
+ if (drbd_submit_ee(mdev, e, rw, DRBD_FAULT_DT_WR) == 0)
+ return TRUE;
out_interrupted:
/* yes, the epoch_size now is imbalanced.
"no local data.\n");
drbd_send_ack_rp(mdev, h->command == P_DATA_REQUEST ? P_NEG_DREPLY :
P_NEG_RS_DREPLY , p);
- return TRUE;
+ return drbd_drain_block(mdev, h->length - brps);
}
/* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD
return FALSE;
}
- e->private_bio->bi_rw = READ;
- e->private_bio->bi_end_io = drbd_endio_read_sec;
-
switch (h->command) {
case P_DATA_REQUEST:
e->w.cb = w_e_end_data_req;
inc_unacked(mdev);
- drbd_generic_make_request(mdev, fault_type, e->private_bio);
- maybe_kick_lo(mdev);
-
- return TRUE;
+ if (drbd_submit_ee(mdev, e, READ, fault_type) == 0)
+ return TRUE;
out_free_e:
kfree(di);
hg > 0 ? "source" : "target");
}
+ if (abs(hg) == 100)
+ drbd_khelper(mdev, "initial-split-brain");
+
if (hg == 100 || (hg == -100 && mdev->net_conf->always_asbp)) {
int pcount = (mdev->state.role == R_PRIMARY)
+ (peer_role == R_PRIMARY);
* after an attempted attach on a diskless node.
* We just refuse to attach -- well, we drop the "connection"
* to that disk, in a way... */
- dev_alert(DEV, "Split-Brain detected, dropping connection!\n");
+ dev_alert(DEV, "Split-Brain detected but unresolved, dropping connection!\n");
drbd_khelper(mdev, "split-brain");
return C_MASK;
}
unsigned int max_seg_s;
sector_t p_size, p_usize, my_usize;
int ldsc = 0; /* local disk size changed */
- enum drbd_conns nconn;
+ enum dds_flags ddsf;
ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE;
if (drbd_recv(mdev, h->payload, h->length) != h->length)
}
#undef min_not_zero
+ ddsf = be16_to_cpu(p->dds_flags);
if (get_ldev(mdev)) {
- dd = drbd_determin_dev_size(mdev, 0);
+ dd = drbd_determin_dev_size(mdev, ddsf);
put_ldev(mdev);
if (dd == dev_size_error)
return FALSE;
drbd_set_my_capacity(mdev, p_size);
}
- if (mdev->p_uuid && mdev->state.conn <= C_CONNECTED && get_ldev(mdev)) {
- nconn = drbd_sync_handshake(mdev,
- mdev->state.peer, mdev->state.pdsk);
- put_ldev(mdev);
-
- if (nconn == C_MASK) {
- drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
- return FALSE;
- }
-
- if (drbd_request_state(mdev, NS(conn, nconn)) < SS_SUCCESS) {
- drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
- return FALSE;
- }
- }
-
if (get_ldev(mdev)) {
if (mdev->ldev->known_size != drbd_get_capacity(mdev->ldev->backing_bdev)) {
mdev->ldev->known_size = drbd_get_capacity(mdev->ldev->backing_bdev);
ldsc = 1;
}
- max_seg_s = be32_to_cpu(p->max_segment_size);
+ if (mdev->agreed_pro_version < 94)
+ max_seg_s = be32_to_cpu(p->max_segment_size);
+ else /* drbd 8.3.8 onwards */
+ max_seg_s = DRBD_MAX_SEGMENT_SIZE;
+
if (max_seg_s != queue_max_segment_size(mdev->rq_queue))
drbd_setup_queue_param(mdev, max_seg_s);
- drbd_setup_order_type(mdev, be32_to_cpu(p->queue_order_type));
+ drbd_setup_order_type(mdev, be16_to_cpu(p->queue_order_type));
put_ldev(mdev);
}
drbd_get_capacity(mdev->this_bdev) || ldsc) {
/* we have different sizes, probably peer
* needs to know my new size... */
- drbd_send_sizes(mdev, 0);
+ drbd_send_sizes(mdev, 0, ddsf);
}
if (test_and_clear_bit(RESIZE_PENDING, &mdev->flags) ||
(dd == grew && mdev->state.conn == C_CONNECTED)) {
if (mdev->state.pdsk >= D_INCONSISTENT &&
- mdev->state.disk >= D_INCONSISTENT)
- resync_after_online_grow(mdev);
- else
+ mdev->state.disk >= D_INCONSISTENT) {
+ if (ddsf & DDSF_NO_RESYNC)
+ dev_info(DEV, "Resync of new storage suppressed with --assume-clean\n");
+ else
+ resync_after_online_grow(mdev);
+ } else
set_bit(RESYNC_AFTER_NEG, &mdev->flags);
}
}
return TRUE;
}
+static void timeval_sub_us(struct timeval* tv, unsigned int us)
+{
+ tv->tv_sec -= us / 1000000;
+ us = us % 1000000;
+ if (tv->tv_usec > us) {
+ tv->tv_usec += 1000000;
+ tv->tv_sec--;
+ }
+ tv->tv_usec -= us;
+}
+
+static void got_delay_probe(struct drbd_conf *mdev, int from, struct p_delay_probe *p)
+{
+ struct delay_probe *dp;
+ struct list_head *le;
+ struct timeval now;
+ int seq_num;
+ int offset;
+ int data_delay;
+
+ seq_num = be32_to_cpu(p->seq_num);
+ offset = be32_to_cpu(p->offset);
+
+ spin_lock(&mdev->peer_seq_lock);
+ if (!list_empty(&mdev->delay_probes)) {
+ if (from == USE_DATA_SOCKET)
+ le = mdev->delay_probes.next;
+ else
+ le = mdev->delay_probes.prev;
+
+ dp = list_entry(le, struct delay_probe, list);
+
+ if (dp->seq_num == seq_num) {
+ list_del(le);
+ spin_unlock(&mdev->peer_seq_lock);
+ do_gettimeofday(&now);
+ timeval_sub_us(&now, offset);
+ data_delay =
+ now.tv_usec - dp->time.tv_usec +
+ (now.tv_sec - dp->time.tv_sec) * 1000000;
+
+ if (data_delay > 0)
+ mdev->data_delay = data_delay;
+
+ kfree(dp);
+ return;
+ }
+
+ if (dp->seq_num > seq_num) {
+ spin_unlock(&mdev->peer_seq_lock);
+ dev_warn(DEV, "Previous allocation failure of struct delay_probe?\n");
+ return; /* Do not alloca a struct delay_probe.... */
+ }
+ }
+ spin_unlock(&mdev->peer_seq_lock);
+
+ dp = kmalloc(sizeof(struct delay_probe), GFP_NOIO);
+ if (!dp) {
+ dev_warn(DEV, "Failed to allocate a struct delay_probe, do not worry.\n");
+ return;
+ }
+
+ dp->seq_num = seq_num;
+ do_gettimeofday(&dp->time);
+ timeval_sub_us(&dp->time, offset);
+
+ spin_lock(&mdev->peer_seq_lock);
+ if (from == USE_DATA_SOCKET)
+ list_add(&dp->list, &mdev->delay_probes);
+ else
+ list_add_tail(&dp->list, &mdev->delay_probes);
+ spin_unlock(&mdev->peer_seq_lock);
+}
+
+static int receive_delay_probe(struct drbd_conf *mdev, struct p_header *h)
+{
+ struct p_delay_probe *p = (struct p_delay_probe *)h;
+
+ ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE;
+ if (drbd_recv(mdev, h->payload, h->length) != h->length)
+ return FALSE;
+
+ got_delay_probe(mdev, USE_DATA_SOCKET, p);
+ return TRUE;
+}
+
typedef int (*drbd_cmd_handler_f)(struct drbd_conf *, struct p_header *);
static drbd_cmd_handler_f drbd_default_handler[] = {
[P_OV_REQUEST] = receive_DataRequest,
[P_OV_REPLY] = receive_DataRequest,
[P_CSUM_RS_REQUEST] = receive_DataRequest,
+ [P_DELAY_PROBE] = receive_delay_probe,
/* anything missing from this table is in
* the asender_tbl, see get_asender_cmd */
[P_MAX_CMD] = NULL,
dev_info(DEV, "net_ee not empty, killed %u entries\n", i);
i = atomic_read(&mdev->pp_in_use);
if (i)
- dev_info(DEV, "pp_in_use = %u, expected 0\n", i);
+ dev_info(DEV, "pp_in_use = %d, expected 0\n", i);
D_ASSERT(list_empty(&mdev->read_ee));
D_ASSERT(list_empty(&mdev->active_ee));
sector = be64_to_cpu(p->sector);
size = be32_to_cpu(p->blksize);
- D_ASSERT(p->block_id == ID_SYNCER);
update_peer_seq(mdev, be32_to_cpu(p->seq_num));
return TRUE;
}
+static int got_delay_probe_m(struct drbd_conf *mdev, struct p_header *h)
+{
+ struct p_delay_probe *p = (struct p_delay_probe *)h;
+
+ got_delay_probe(mdev, USE_META_SOCKET, p);
+ return TRUE;
+}
+
struct asender_cmd {
size_t pkt_size;
int (*process)(struct drbd_conf *mdev, struct p_header *h);
[P_BARRIER_ACK] = { sizeof(struct p_barrier_ack), got_BarrierAck },
[P_STATE_CHG_REPLY] = { sizeof(struct p_req_state_reply), got_RqSReply },
[P_RS_IS_IN_SYNC] = { sizeof(struct p_block_ack), got_IsInSync },
+ [P_DELAY_PROBE] = { sizeof(struct p_delay_probe), got_delay_probe_m },
[P_MAX_CMD] = { 0, NULL },
};
if (cmd > P_MAX_CMD || asender_tbl[cmd].process == NULL)
struct drbd_request *req;
int local, remote;
int err = -EIO;
+ int ret = 0;
/* allocate outside of all locks; */
req = drbd_req_new(mdev, bio);
(mdev->state.pdsk == D_INCONSISTENT &&
mdev->state.conn >= C_CONNECTED));
- if (!(local || remote)) {
+ if (!(local || remote) && !mdev->state.susp) {
dev_err(DEV, "IO ERROR: neither local nor remote disk\n");
goto fail_free_complete;
}
/* GOOD, everything prepared, grab the spin_lock */
spin_lock_irq(&mdev->req_lock);
+ if (mdev->state.susp) {
+ /* If we got suspended, use the retry mechanism of
+ generic_make_request() to restart processing of this
+ bio. In the next call to drbd_make_request_26
+ we sleep in inc_ap_bio() */
+ ret = 1;
+ spin_unlock_irq(&mdev->req_lock);
+ goto fail_free_complete;
+ }
+
if (remote) {
remote = (mdev->state.pdsk == D_UP_TO_DATE ||
(mdev->state.pdsk == D_INCONSISTENT &&
req->private_bio = NULL;
put_ldev(mdev);
}
- bio_endio(bio, err);
+ if (!ret)
+ bio_endio(bio, err);
+
drbd_req_free(req);
dec_ap_bio(mdev);
kfree(b);
- return 0;
+ return ret;
}
/* helper function for drbd_make_request
*/
static int drbd_fail_request_early(struct drbd_conf *mdev, int is_write)
{
- /* Unconfigured */
- if (mdev->state.conn == C_DISCONNECTING &&
- mdev->state.disk == D_DISKLESS)
- return 1;
-
if (mdev->state.role != R_PRIMARY &&
(!allow_oos || is_write)) {
if (__ratelimit(&drbd_ratelimit_state)) {
/* we need to get a "reference count" (ap_bio_cnt)
* to avoid races with the disconnect/reconnect/suspend code.
- * In case we need to split the bio here, we need to get two references
+ * In case we need to split the bio here, we need to get three references
* atomically, otherwise we might deadlock when trying to submit the
* second one! */
- inc_ap_bio(mdev, 2);
+ inc_ap_bio(mdev, 3);
D_ASSERT(e_enr == s_enr + 1);
- drbd_make_request_common(mdev, &bp->bio1);
- drbd_make_request_common(mdev, &bp->bio2);
+ while (drbd_make_request_common(mdev, &bp->bio1))
+ inc_ap_bio(mdev, 1);
+
+ while (drbd_make_request_common(mdev, &bp->bio2))
+ inc_ap_bio(mdev, 1);
+
+ dec_ap_bio(mdev);
+
bio_pair_release(bp);
}
return 0;
} else if (limit && get_ldev(mdev)) {
struct request_queue * const b =
mdev->ldev->backing_bdev->bd_disk->queue;
- if (b->merge_bvec_fn && mdev->ldev->dc.use_bmbv) {
+ if (b->merge_bvec_fn) {
backing_limit = b->merge_bvec_fn(b, bvm, bvec);
limit = min(limit, backing_limit);
}
static const char *drbd_state_sw_errors[] = {
[-SS_TWO_PRIMARIES] = "Multiple primaries not allowed by config",
- [-SS_NO_UP_TO_DATE_DISK] = "Refusing to be Primary without at least one UpToDate disk",
+ [-SS_NO_UP_TO_DATE_DISK] = "Need access to UpToDate data",
[-SS_NO_LOCAL_DISK] = "Can not resync without local disk",
[-SS_NO_REMOTE_DISK] = "Can not resync without remote disk",
[-SS_CONNECTED_OUTDATES] = "Refusing to be Outdated while Connected",
/* defined here:
drbd_md_io_complete
- drbd_endio_write_sec
- drbd_endio_read_sec
+ drbd_endio_sec
drbd_endio_pri
* more endio handlers:
/* reads on behalf of the partner,
* "submitted" by the receiver
*/
-void drbd_endio_read_sec(struct bio *bio, int error) __releases(local)
+void drbd_endio_read_sec_final(struct drbd_epoch_entry *e) __releases(local)
{
unsigned long flags = 0;
- struct drbd_epoch_entry *e = NULL;
- struct drbd_conf *mdev;
- int uptodate = bio_flagged(bio, BIO_UPTODATE);
-
- e = bio->bi_private;
- mdev = e->mdev;
-
- if (error)
- dev_warn(DEV, "read: error=%d s=%llus\n", error,
- (unsigned long long)e->sector);
- if (!error && !uptodate) {
- dev_warn(DEV, "read: setting error to -EIO s=%llus\n",
- (unsigned long long)e->sector);
- /* strange behavior of some lower level drivers...
- * fail the request by clearing the uptodate flag,
- * but do not return any error?! */
- error = -EIO;
- }
+ struct drbd_conf *mdev = e->mdev;
D_ASSERT(e->block_id != ID_VACANT);
list_del(&e->w.list);
if (list_empty(&mdev->read_ee))
wake_up(&mdev->ee_wait);
+ if (test_bit(__EE_WAS_ERROR, &e->flags))
+ __drbd_chk_io_error(mdev, FALSE);
spin_unlock_irqrestore(&mdev->req_lock, flags);
- drbd_chk_io_error(mdev, error, FALSE);
drbd_queue_work(&mdev->data.work, &e->w);
put_ldev(mdev);
}
+static int is_failed_barrier(int ee_flags)
+{
+ return (ee_flags & (EE_IS_BARRIER|EE_WAS_ERROR|EE_RESUBMITTED))
+ == (EE_IS_BARRIER|EE_WAS_ERROR);
+}
+
/* writes on behalf of the partner, or resync writes,
- * "submitted" by the receiver.
- */
-void drbd_endio_write_sec(struct bio *bio, int error) __releases(local)
+ * "submitted" by the receiver, final stage. */
+static void drbd_endio_write_sec_final(struct drbd_epoch_entry *e) __releases(local)
{
unsigned long flags = 0;
- struct drbd_epoch_entry *e = NULL;
- struct drbd_conf *mdev;
+ struct drbd_conf *mdev = e->mdev;
sector_t e_sector;
int do_wake;
int is_syncer_req;
int do_al_complete_io;
- int uptodate = bio_flagged(bio, BIO_UPTODATE);
- int is_barrier = bio_rw_flagged(bio, BIO_RW_BARRIER);
-
- e = bio->bi_private;
- mdev = e->mdev;
- if (error)
- dev_warn(DEV, "write: error=%d s=%llus\n", error,
- (unsigned long long)e->sector);
- if (!error && !uptodate) {
- dev_warn(DEV, "write: setting error to -EIO s=%llus\n",
- (unsigned long long)e->sector);
- /* strange behavior of some lower level drivers...
- * fail the request by clearing the uptodate flag,
- * but do not return any error?! */
- error = -EIO;
- }
-
- /* error == -ENOTSUPP would be a better test,
- * alas it is not reliable */
- if (error && is_barrier && e->flags & EE_IS_BARRIER) {
+ /* if this is a failed barrier request, disable use of barriers,
+ * and schedule for resubmission */
+ if (is_failed_barrier(e->flags)) {
drbd_bump_write_ordering(mdev, WO_bdev_flush);
spin_lock_irqsave(&mdev->req_lock, flags);
list_del(&e->w.list);
+ e->flags = (e->flags & ~EE_WAS_ERROR) | EE_RESUBMITTED;
e->w.cb = w_e_reissue;
/* put_ldev actually happens below, once we come here again. */
__release(local);
D_ASSERT(e->block_id != ID_VACANT);
- spin_lock_irqsave(&mdev->req_lock, flags);
- mdev->writ_cnt += e->size >> 9;
- is_syncer_req = is_syncer_block_id(e->block_id);
-
/* after we moved e to done_ee,
* we may no longer access it,
* it may be freed/reused already!
* (as soon as we release the req_lock) */
e_sector = e->sector;
do_al_complete_io = e->flags & EE_CALL_AL_COMPLETE_IO;
+ is_syncer_req = is_syncer_block_id(e->block_id);
+ spin_lock_irqsave(&mdev->req_lock, flags);
+ mdev->writ_cnt += e->size >> 9;
list_del(&e->w.list); /* has been on active_ee or sync_ee */
list_add_tail(&e->w.list, &mdev->done_ee);
? list_empty(&mdev->sync_ee)
: list_empty(&mdev->active_ee);
- if (error)
+ if (test_bit(__EE_WAS_ERROR, &e->flags))
__drbd_chk_io_error(mdev, FALSE);
spin_unlock_irqrestore(&mdev->req_lock, flags);
wake_asender(mdev);
put_ldev(mdev);
+}
+
+/* writes on behalf of the partner, or resync writes,
+ * "submitted" by the receiver.
+ */
+void drbd_endio_sec(struct bio *bio, int error)
+{
+ struct drbd_epoch_entry *e = bio->bi_private;
+ struct drbd_conf *mdev = e->mdev;
+ int uptodate = bio_flagged(bio, BIO_UPTODATE);
+ int is_write = bio_data_dir(bio) == WRITE;
+
+ if (error)
+ dev_warn(DEV, "%s: error=%d s=%llus\n",
+ is_write ? "write" : "read", error,
+ (unsigned long long)e->sector);
+ if (!error && !uptodate) {
+ dev_warn(DEV, "%s: setting error to -EIO s=%llus\n",
+ is_write ? "write" : "read",
+ (unsigned long long)e->sector);
+ /* strange behavior of some lower level drivers...
+ * fail the request by clearing the uptodate flag,
+ * but do not return any error?! */
+ error = -EIO;
+ }
+
+ if (error)
+ set_bit(__EE_WAS_ERROR, &e->flags);
+ bio_put(bio); /* no need for the bio anymore */
+ if (atomic_dec_and_test(&e->pending_bios)) {
+ if (is_write)
+ drbd_endio_write_sec_final(e);
+ else
+ drbd_endio_read_sec_final(e);
+ }
}
/* read, readA or write requests on R_PRIMARY coming from drbd_make_request
return 1; /* Simply ignore this! */
}
-void drbd_csum(struct drbd_conf *mdev, struct crypto_hash *tfm, struct bio *bio, void *digest)
+void drbd_csum_ee(struct drbd_conf *mdev, struct crypto_hash *tfm, struct drbd_epoch_entry *e, void *digest)
+{
+ struct hash_desc desc;
+ struct scatterlist sg;
+ struct page *page = e->pages;
+ struct page *tmp;
+ unsigned len;
+
+ desc.tfm = tfm;
+ desc.flags = 0;
+
+ sg_init_table(&sg, 1);
+ crypto_hash_init(&desc);
+
+ while ((tmp = page_chain_next(page))) {
+ /* all but the last page will be fully used */
+ sg_set_page(&sg, page, PAGE_SIZE, 0);
+ crypto_hash_update(&desc, &sg, sg.length);
+ page = tmp;
+ }
+ /* and now the last, possibly only partially used page */
+ len = e->size & (PAGE_SIZE - 1);
+ sg_set_page(&sg, page, len ?: PAGE_SIZE, 0);
+ crypto_hash_update(&desc, &sg, sg.length);
+ crypto_hash_final(&desc, digest);
+}
+
+void drbd_csum_bio(struct drbd_conf *mdev, struct crypto_hash *tfm, struct bio *bio, void *digest)
{
struct hash_desc desc;
struct scatterlist sg;
return 1;
}
- if (likely(drbd_bio_uptodate(e->private_bio))) {
+ if (likely((e->flags & EE_WAS_ERROR) == 0)) {
digest_size = crypto_hash_digestsize(mdev->csums_tfm);
digest = kmalloc(digest_size, GFP_NOIO);
if (digest) {
- drbd_csum(mdev, mdev->csums_tfm, e->private_bio, digest);
+ drbd_csum_ee(mdev, mdev->csums_tfm, e, digest);
inc_rs_pending(mdev);
ok = drbd_send_drequest_csum(mdev,
/* GFP_TRY, because if there is no memory available right now, this may
* be rescheduled for later. It is "only" background resync, after all. */
e = drbd_alloc_ee(mdev, DRBD_MAGIC+0xbeef, sector, size, GFP_TRY);
- if (!e) {
- put_ldev(mdev);
- return 2;
- }
+ if (!e)
+ goto fail;
spin_lock_irq(&mdev->req_lock);
list_add(&e->w.list, &mdev->read_ee);
spin_unlock_irq(&mdev->req_lock);
- e->private_bio->bi_end_io = drbd_endio_read_sec;
- e->private_bio->bi_rw = READ;
e->w.cb = w_e_send_csum;
+ if (drbd_submit_ee(mdev, e, READ, DRBD_FAULT_RS_RD) == 0)
+ return 1;
- mdev->read_cnt += size >> 9;
- drbd_generic_make_request(mdev, DRBD_FAULT_RS_RD, e->private_bio);
-
- return 1;
+ drbd_free_ee(mdev, e);
+fail:
+ put_ldev(mdev);
+ return 2;
}
void resync_timer_fn(unsigned long data)
drbd_queue_work(&mdev->data.work, &mdev->resync_work);
}
+static int calc_resync_rate(struct drbd_conf *mdev)
+{
+ int d = mdev->data_delay / 1000; /* us -> ms */
+ int td = mdev->sync_conf.throttle_th * 100; /* 0.1s -> ms */
+ int hd = mdev->sync_conf.hold_off_th * 100; /* 0.1s -> ms */
+ int cr = mdev->sync_conf.rate;
+
+ return d <= td ? cr :
+ d >= hd ? 0 :
+ cr + (cr * (td - d) / (hd - td));
+}
+
int w_make_resync_request(struct drbd_conf *mdev,
struct drbd_work *w, int cancel)
{
unsigned long bit;
sector_t sector;
const sector_t capacity = drbd_get_capacity(mdev->this_bdev);
- int max_segment_size = queue_max_segment_size(mdev->rq_queue);
+ int max_segment_size;
int number, i, size, pe, mx;
int align, queued, sndbuf;
return 1;
}
- number = SLEEP_TIME * mdev->sync_conf.rate / ((BM_BLOCK_SIZE/1024)*HZ);
+ /* starting with drbd 8.3.8, we can handle multi-bio EEs,
+ * if it should be necessary */
+ max_segment_size = mdev->agreed_pro_version < 94 ?
+ queue_max_segment_size(mdev->rq_queue) : DRBD_MAX_SEGMENT_SIZE;
+
+ mdev->c_sync_rate = calc_resync_rate(mdev);
+ number = SLEEP_TIME * mdev->c_sync_rate / ((BM_BLOCK_SIZE / 1024) * HZ);
pe = atomic_read(&mdev->rs_pending_cnt);
mutex_lock(&mdev->data.mutex);
*
* Additionally always align bigger requests, in order to
* be prepared for all stripe sizes of software RAIDs.
- *
- * we _do_ care about the agreed-upon q->max_segment_size
- * here, as splitting up the requests on the other side is more
- * difficult. the consequence is, that on lvm and md and other
- * "indirect" devices, this is dead code, since
- * q->max_segment_size will be PAGE_SIZE.
*/
align = 1;
for (;;) {
/* helper */
static void move_to_net_ee_or_free(struct drbd_conf *mdev, struct drbd_epoch_entry *e)
{
- if (drbd_bio_has_active_page(e->private_bio)) {
+ if (drbd_ee_has_active_page(e)) {
/* This might happen if sendpage() has not finished */
spin_lock_irq(&mdev->req_lock);
list_add_tail(&e->w.list, &mdev->net_ee);
return 1;
}
- if (likely(drbd_bio_uptodate(e->private_bio))) {
+ if (likely((e->flags & EE_WAS_ERROR) == 0)) {
ok = drbd_send_block(mdev, P_DATA_REPLY, e);
} else {
if (__ratelimit(&drbd_ratelimit_state))
put_ldev(mdev);
}
- if (likely(drbd_bio_uptodate(e->private_bio))) {
+ if (likely((e->flags & EE_WAS_ERROR) == 0)) {
if (likely(mdev->state.pdsk >= D_INCONSISTENT)) {
inc_rs_pending(mdev);
ok = drbd_send_block(mdev, P_RS_DATA_REPLY, e);
di = (struct digest_info *)(unsigned long)e->block_id;
- if (likely(drbd_bio_uptodate(e->private_bio))) {
+ if (likely((e->flags & EE_WAS_ERROR) == 0)) {
/* quick hack to try to avoid a race against reconfiguration.
* a real fix would be much more involved,
* introducing more locking mechanisms */
digest = kmalloc(digest_size, GFP_NOIO);
}
if (digest) {
- drbd_csum(mdev, mdev->csums_tfm, e->private_bio, digest);
+ drbd_csum_ee(mdev, mdev->csums_tfm, e, digest);
eq = !memcmp(digest, di->digest, digest_size);
kfree(digest);
}
if (unlikely(cancel))
goto out;
- if (unlikely(!drbd_bio_uptodate(e->private_bio)))
+ if (unlikely((e->flags & EE_WAS_ERROR) != 0))
goto out;
digest_size = crypto_hash_digestsize(mdev->verify_tfm);
/* FIXME if this allocation fails, online verify will not terminate! */
digest = kmalloc(digest_size, GFP_NOIO);
if (digest) {
- drbd_csum(mdev, mdev->verify_tfm, e->private_bio, digest);
+ drbd_csum_ee(mdev, mdev->verify_tfm, e, digest);
inc_rs_pending(mdev);
ok = drbd_send_drequest_csum(mdev, e->sector, e->size,
digest, digest_size, P_OV_REPLY);
di = (struct digest_info *)(unsigned long)e->block_id;
- if (likely(drbd_bio_uptodate(e->private_bio))) {
+ if (likely((e->flags & EE_WAS_ERROR) == 0)) {
digest_size = crypto_hash_digestsize(mdev->verify_tfm);
digest = kmalloc(digest_size, GFP_NOIO);
if (digest) {
- drbd_csum(mdev, mdev->verify_tfm, e->private_bio, digest);
+ drbd_csum_ee(mdev, mdev->verify_tfm, e, digest);
D_ASSERT(digest_size == di->digest_size);
eq = !memcmp(digest, di->digest, digest_size);
#define drbd_bio_uptodate(bio) bio_flagged(bio, BIO_UPTODATE)
-static inline int drbd_bio_has_active_page(struct bio *bio)
-{
- struct bio_vec *bvec;
- int i;
-
- __bio_for_each_segment(bvec, bio, i, 0) {
- if (page_count(bvec->bv_page) > 1)
- return 1;
- }
-
- return 0;
-}
-
/* bi_end_io handlers */
extern void drbd_md_io_complete(struct bio *bio, int error);
-extern void drbd_endio_read_sec(struct bio *bio, int error);
-extern void drbd_endio_write_sec(struct bio *bio, int error);
+extern void drbd_endio_sec(struct bio *bio, int error);
extern void drbd_endio_pri(struct bio *bio, int error);
/*
return 0;
}
-static u64 ide_disk_set_capacity(ide_drive_t *drive, u64 capacity)
+static void ide_disk_unlock_native_capacity(ide_drive_t *drive)
{
- u64 set = min(capacity, drive->probed_capacity);
u16 *id = drive->id;
int lba48 = ata_id_lba48_enabled(id);
if ((drive->dev_flags & IDE_DFLAG_LBA) == 0 ||
ata_id_hpa_enabled(id) == 0)
- goto out;
+ return;
/*
* according to the spec the SET MAX ADDRESS command shall be
* immediately preceded by a READ NATIVE MAX ADDRESS command
*/
- capacity = ide_disk_hpa_get_native_capacity(drive, lba48);
- if (capacity == 0)
- goto out;
-
- set = ide_disk_hpa_set_capacity(drive, set, lba48);
- if (set) {
- /* needed for ->resume to disable HPA */
- drive->dev_flags |= IDE_DFLAG_NOHPA;
- return set;
- }
-out:
- return drive->capacity64;
+ if (!ide_disk_hpa_get_native_capacity(drive, lba48))
+ return;
+
+ if (ide_disk_hpa_set_capacity(drive, drive->probed_capacity, lba48))
+ drive->dev_flags |= IDE_DFLAG_NOHPA; /* disable HPA on resume */
}
static void idedisk_prepare_flush(struct request_queue *q, struct request *rq)
}
const struct ide_disk_ops ide_ata_disk_ops = {
- .check = ide_disk_check,
- .set_capacity = ide_disk_set_capacity,
- .get_capacity = ide_disk_get_capacity,
- .setup = ide_disk_setup,
- .flush = ide_disk_flush,
- .init_media = ide_disk_init_media,
- .set_doorlock = ide_disk_set_doorlock,
- .do_request = ide_do_rw_disk,
- .ioctl = ide_disk_ioctl,
+ .check = ide_disk_check,
+ .unlock_native_capacity = ide_disk_unlock_native_capacity,
+ .get_capacity = ide_disk_get_capacity,
+ .setup = ide_disk_setup,
+ .flush = ide_disk_flush,
+ .init_media = ide_disk_init_media,
+ .set_doorlock = ide_disk_set_doorlock,
+ .do_request = ide_do_rw_disk,
+ .ioctl = ide_disk_ioctl,
};
return ret;
}
-static unsigned long long ide_gd_set_capacity(struct gendisk *disk,
- unsigned long long capacity)
+static void ide_gd_unlock_native_capacity(struct gendisk *disk)
{
struct ide_disk_obj *idkp = ide_drv_g(disk, ide_disk_obj);
ide_drive_t *drive = idkp->drive;
const struct ide_disk_ops *disk_ops = drive->disk_ops;
- if (disk_ops->set_capacity)
- return disk_ops->set_capacity(drive, capacity);
-
- return drive->capacity64;
+ if (disk_ops->unlock_native_capacity)
+ disk_ops->unlock_native_capacity(drive);
}
static int ide_gd_revalidate_disk(struct gendisk *disk)
.locked_ioctl = ide_gd_ioctl,
.getgeo = ide_gd_getgeo,
.media_changed = ide_gd_media_changed,
- .set_capacity = ide_gd_set_capacity,
+ .unlock_native_capacity = ide_gd_unlock_native_capacity,
.revalidate_disk = ide_gd_revalidate_disk
};
*/
mutex_unlock(&bd_inode->i_mutex);
- error = blkdev_issue_flush(bdev, NULL);
+ error = blkdev_issue_flush(bdev, GFP_KERNEL, NULL, BLKDEV_IFL_WAIT);
if (error == -EOPNOTSUPP)
error = 0;
iput(bdev->bd_inode);
}
-int bd_claim(struct block_device *bdev, void *holder)
+/**
+ * bd_may_claim - test whether a block device can be claimed
+ * @bdev: block device of interest
+ * @whole: whole block device containing @bdev, may equal @bdev
+ * @holder: holder trying to claim @bdev
+ *
+ * Test whther @bdev can be claimed by @holder.
+ *
+ * CONTEXT:
+ * spin_lock(&bdev_lock).
+ *
+ * RETURNS:
+ * %true if @bdev can be claimed, %false otherwise.
+ */
+static bool bd_may_claim(struct block_device *bdev, struct block_device *whole,
+ void *holder)
{
- int res;
- spin_lock(&bdev_lock);
-
- /* first decide result */
if (bdev->bd_holder == holder)
- res = 0; /* already a holder */
+ return true; /* already a holder */
else if (bdev->bd_holder != NULL)
- res = -EBUSY; /* held by someone else */
+ return false; /* held by someone else */
else if (bdev->bd_contains == bdev)
- res = 0; /* is a whole device which isn't held */
+ return true; /* is a whole device which isn't held */
- else if (bdev->bd_contains->bd_holder == bd_claim)
- res = 0; /* is a partition of a device that is being partitioned */
- else if (bdev->bd_contains->bd_holder != NULL)
- res = -EBUSY; /* is a partition of a held device */
+ else if (whole->bd_holder == bd_claim)
+ return true; /* is a partition of a device that is being partitioned */
+ else if (whole->bd_holder != NULL)
+ return false; /* is a partition of a held device */
else
- res = 0; /* is a partition of an un-held device */
+ return true; /* is a partition of an un-held device */
+}
+
+/**
+ * bd_prepare_to_claim - prepare to claim a block device
+ * @bdev: block device of interest
+ * @whole: the whole device containing @bdev, may equal @bdev
+ * @holder: holder trying to claim @bdev
+ *
+ * Prepare to claim @bdev. This function fails if @bdev is already
+ * claimed by another holder and waits if another claiming is in
+ * progress. This function doesn't actually claim. On successful
+ * return, the caller has ownership of bd_claiming and bd_holder[s].
+ *
+ * CONTEXT:
+ * spin_lock(&bdev_lock). Might release bdev_lock, sleep and regrab
+ * it multiple times.
+ *
+ * RETURNS:
+ * 0 if @bdev can be claimed, -EBUSY otherwise.
+ */
+static int bd_prepare_to_claim(struct block_device *bdev,
+ struct block_device *whole, void *holder)
+{
+retry:
+ /* if someone else claimed, fail */
+ if (!bd_may_claim(bdev, whole, holder))
+ return -EBUSY;
+
+ /* if someone else is claiming, wait for it to finish */
+ if (whole->bd_claiming && whole->bd_claiming != holder) {
+ wait_queue_head_t *wq = bit_waitqueue(&whole->bd_claiming, 0);
+ DEFINE_WAIT(wait);
+
+ prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE);
+ spin_unlock(&bdev_lock);
+ schedule();
+ finish_wait(wq, &wait);
+ spin_lock(&bdev_lock);
+ goto retry;
+ }
+
+ /* yay, all mine */
+ return 0;
+}
+
+/**
+ * bd_start_claiming - start claiming a block device
+ * @bdev: block device of interest
+ * @holder: holder trying to claim @bdev
+ *
+ * @bdev is about to be opened exclusively. Check @bdev can be opened
+ * exclusively and mark that an exclusive open is in progress. Each
+ * successful call to this function must be matched with a call to
+ * either bd_claim() or bd_abort_claiming(). If this function
+ * succeeds, the matching bd_claim() is guaranteed to succeed.
+ *
+ * CONTEXT:
+ * Might sleep.
+ *
+ * RETURNS:
+ * Pointer to the block device containing @bdev on success, ERR_PTR()
+ * value on failure.
+ */
+static struct block_device *bd_start_claiming(struct block_device *bdev,
+ void *holder)
+{
+ struct gendisk *disk;
+ struct block_device *whole;
+ int partno, err;
+
+ might_sleep();
+
+ /*
+ * @bdev might not have been initialized properly yet, look up
+ * and grab the outer block device the hard way.
+ */
+ disk = get_gendisk(bdev->bd_dev, &partno);
+ if (!disk)
+ return ERR_PTR(-ENXIO);
+
+ whole = bdget_disk(disk, 0);
+ put_disk(disk);
+ if (!whole)
+ return ERR_PTR(-ENOMEM);
+
+ /* prepare to claim, if successful, mark claiming in progress */
+ spin_lock(&bdev_lock);
+
+ err = bd_prepare_to_claim(bdev, whole, holder);
+ if (err == 0) {
+ whole->bd_claiming = holder;
+ spin_unlock(&bdev_lock);
+ return whole;
+ } else {
+ spin_unlock(&bdev_lock);
+ bdput(whole);
+ return ERR_PTR(err);
+ }
+}
- /* now impose change */
- if (res==0) {
+/* releases bdev_lock */
+static void __bd_abort_claiming(struct block_device *whole, void *holder)
+{
+ BUG_ON(whole->bd_claiming != holder);
+ whole->bd_claiming = NULL;
+ wake_up_bit(&whole->bd_claiming, 0);
+
+ spin_unlock(&bdev_lock);
+ bdput(whole);
+}
+
+/**
+ * bd_abort_claiming - abort claiming a block device
+ * @whole: whole block device returned by bd_start_claiming()
+ * @holder: holder trying to claim @bdev
+ *
+ * Abort a claiming block started by bd_start_claiming(). Note that
+ * @whole is not the block device to be claimed but the whole device
+ * returned by bd_start_claiming().
+ *
+ * CONTEXT:
+ * Grabs and releases bdev_lock.
+ */
+static void bd_abort_claiming(struct block_device *whole, void *holder)
+{
+ spin_lock(&bdev_lock);
+ __bd_abort_claiming(whole, holder); /* releases bdev_lock */
+}
+
+/**
+ * bd_claim - claim a block device
+ * @bdev: block device to claim
+ * @holder: holder trying to claim @bdev
+ *
+ * Try to claim @bdev which must have been opened successfully. This
+ * function may be called with or without preceding
+ * blk_start_claiming(). In the former case, this function is always
+ * successful and terminates the claiming block.
+ *
+ * CONTEXT:
+ * Might sleep.
+ *
+ * RETURNS:
+ * 0 if successful, -EBUSY if @bdev is already claimed.
+ */
+int bd_claim(struct block_device *bdev, void *holder)
+{
+ struct block_device *whole = bdev->bd_contains;
+ int res;
+
+ might_sleep();
+
+ spin_lock(&bdev_lock);
+
+ res = bd_prepare_to_claim(bdev, whole, holder);
+ if (res == 0) {
/* note that for a whole device bd_holders
* will be incremented twice, and bd_holder will
* be set to bd_claim before being set to holder
*/
- bdev->bd_contains->bd_holders ++;
- bdev->bd_contains->bd_holder = bd_claim;
+ whole->bd_holders++;
+ whole->bd_holder = bd_claim;
bdev->bd_holders++;
bdev->bd_holder = holder;
}
- spin_unlock(&bdev_lock);
+
+ if (whole->bd_claiming)
+ __bd_abort_claiming(whole, holder); /* releases bdev_lock */
+ else
+ spin_unlock(&bdev_lock);
+
return res;
}
-
EXPORT_SYMBOL(bd_claim);
void bd_release(struct block_device *bdev)
static int blkdev_open(struct inode * inode, struct file * filp)
{
+ struct block_device *whole = NULL;
struct block_device *bdev;
int res;
if (bdev == NULL)
return -ENOMEM;
+ if (filp->f_mode & FMODE_EXCL) {
+ whole = bd_start_claiming(bdev, filp);
+ if (IS_ERR(whole)) {
+ bdput(bdev);
+ return PTR_ERR(whole);
+ }
+ }
+
filp->f_mapping = bdev->bd_inode->i_mapping;
res = blkdev_get(bdev, filp->f_mode);
- if (res)
- return res;
- if (filp->f_mode & FMODE_EXCL) {
- res = bd_claim(bdev, filp);
- if (res)
- goto out_blkdev_put;
+ if (whole) {
+ if (res == 0)
+ BUG_ON(bd_claim(bdev, filp) != 0);
+ else
+ bd_abort_claiming(whole, filp);
}
- return 0;
-
- out_blkdev_put:
- blkdev_put(bdev, filp->f_mode);
return res;
}
*/
struct block_device *open_bdev_exclusive(const char *path, fmode_t mode, void *holder)
{
- struct block_device *bdev;
- int error = 0;
+ struct block_device *bdev, *whole;
+ int error;
bdev = lookup_bdev(path);
if (IS_ERR(bdev))
return bdev;
+ whole = bd_start_claiming(bdev, holder);
+ if (IS_ERR(whole)) {
+ bdput(bdev);
+ return whole;
+ }
+
error = blkdev_get(bdev, mode);
if (error)
- return ERR_PTR(error);
+ goto out_abort_claiming;
+
error = -EACCES;
if ((mode & FMODE_WRITE) && bdev_read_only(bdev))
- goto blkdev_put;
- error = bd_claim(bdev, holder);
- if (error)
- goto blkdev_put;
+ goto out_blkdev_put;
+ BUG_ON(bd_claim(bdev, holder) != 0);
return bdev;
-
-blkdev_put:
+
+out_blkdev_put:
blkdev_put(bdev, mode);
+out_abort_claiming:
+ bd_abort_claiming(whole, holder);
return ERR_PTR(error);
}
u64 start, u64 len)
{
blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL,
- DISCARD_FL_BARRIER);
+ BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
}
static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
return;
invalidate_bh_lrus();
+ lru_add_drain_all(); /* make sure all lru add caches are flushed */
invalidate_mapping_pages(mapping, 0, -1);
}
EXPORT_SYMBOL(invalidate_bdev);
* storage
*/
if (needs_barrier)
- blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
+ blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL,
+ BLKDEV_IFL_WAIT);
return ret;
}
if (ext4_should_writeback_data(inode) &&
(journal->j_fs_dev != journal->j_dev) &&
(journal->j_flags & JBD2_BARRIER))
- blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
+ blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL,
+ NULL, BLKDEV_IFL_WAIT);
jbd2_log_wait_commit(journal, commit_tid);
} else if (journal->j_flags & JBD2_BARRIER)
- blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
+ blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL,
+ BLKDEV_IFL_WAIT);
return ret;
}
#include <linux/dnotify.h>
#include <linux/slab.h>
#include <linux/module.h>
+#include <linux/pipe_fs_i.h>
#include <linux/security.h>
#include <linux/ptrace.h>
#include <linux/signal.h>
case F_NOTIFY:
err = fcntl_dirnotify(fd, filp, arg);
break;
+ case F_SETPIPE_SZ:
+ case F_GETPIPE_SZ:
+ err = pipe_fcntl(filp, cmd, arg);
+ break;
default:
break;
}
int for_kupdate:1;
int range_cyclic:1;
int for_background:1;
+ int sb_pinned:1;
};
/*
}
static void bdi_alloc_queue_work(struct backing_dev_info *bdi,
- struct wb_writeback_args *args)
+ struct wb_writeback_args *args,
+ int wait)
{
struct bdi_work *work;
if (work) {
bdi_work_init(work, args);
bdi_queue_work(bdi, work);
+ if (wait)
+ bdi_wait_on_work_clear(work);
} else {
struct bdi_writeback *wb = &bdi->wb;
.sync_mode = WB_SYNC_ALL,
.nr_pages = LONG_MAX,
.range_cyclic = 0,
+ /*
+ * Setting sb_pinned is not necessary for WB_SYNC_ALL, but
+ * lets make it explicitly clear.
+ */
+ .sb_pinned = 1,
};
struct bdi_work work;
* @bdi: the backing device to write from
* @sb: write inodes from this super_block
* @nr_pages: the number of pages to write
+ * @sb_locked: caller already holds sb umount sem.
*
* Description:
* This does WB_SYNC_NONE opportunistic writeback. The IO is only
* started when this function returns, we make no guarentees on
- * completion. Caller need not hold sb s_umount semaphore.
+ * completion. Caller specifies whether sb umount sem is held already or not.
*
*/
void bdi_start_writeback(struct backing_dev_info *bdi, struct super_block *sb,
- long nr_pages)
+ long nr_pages, int sb_locked)
{
struct wb_writeback_args args = {
.sb = sb,
.sync_mode = WB_SYNC_NONE,
.nr_pages = nr_pages,
.range_cyclic = 1,
+ .sb_pinned = sb_locked,
};
/*
args.for_background = 1;
}
- bdi_alloc_queue_work(bdi, &args);
+ bdi_alloc_queue_work(bdi, &args, sb_locked);
}
/*
BUG_ON(inode->i_state & I_SYNC);
- /* Set I_SYNC, reset I_DIRTY */
- dirty = inode->i_state & I_DIRTY;
+ /* Set I_SYNC, reset I_DIRTY_PAGES */
inode->i_state |= I_SYNC;
- inode->i_state &= ~I_DIRTY;
-
+ inode->i_state &= ~I_DIRTY_PAGES;
spin_unlock(&inode_lock);
ret = do_writepages(mapping, wbc);
ret = err;
}
+ /*
+ * Some filesystems may redirty the inode during the writeback
+ * due to delalloc, clear dirty metadata flags right before
+ * write_inode()
+ */
+ spin_lock(&inode_lock);
+ dirty = inode->i_state & I_DIRTY;
+ inode->i_state &= ~(I_DIRTY_SYNC | I_DIRTY_DATASYNC);
+ spin_unlock(&inode_lock);
/* Don't write the inode if only I_DIRTY_PAGES was set */
if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
int err = write_inode(inode, wbc);
/*
* Caller must already hold the ref for this
*/
- if (wbc->sync_mode == WB_SYNC_ALL) {
+ if (wbc->sync_mode == WB_SYNC_ALL || wbc->sb_pinned) {
WARN_ON(!rwsem_is_locked(&sb->s_umount));
return SB_NOT_PINNED;
}
.for_kupdate = args->for_kupdate,
.for_background = args->for_background,
.range_cyclic = args->range_cyclic,
+ .sb_pinned = args->sb_pinned,
};
unsigned long oldest_jif;
long wrote = 0;
unsigned long expired;
long nr_pages;
+ /*
+ * When set to zero, disable periodic writeback
+ */
+ if (!dirty_writeback_interval)
+ return 0;
+
expired = wb->last_old_flush +
msecs_to_jiffies(dirty_writeback_interval * 10);
if (time_before(jiffies, expired))
while ((work = get_next_work_item(bdi, wb)) != NULL) {
struct wb_writeback_args args = work->args;
+ int post_clear;
/*
* Override sync mode, in case we must wait for completion
if (force_wait)
work->args.sync_mode = args.sync_mode = WB_SYNC_ALL;
+ post_clear = WB_SYNC_ALL || args.sb_pinned;
+
/*
* If this isn't a data integrity operation, just notify
* that we have seen this work and we are now starting it.
*/
- if (args.sync_mode == WB_SYNC_NONE)
+ if (!post_clear)
wb_clear_pending(wb, work);
wrote += wb_writeback(wb, &args);
* This is a data integrity writeback, so only do the
* notification when we have completed the work.
*/
- if (args.sync_mode == WB_SYNC_ALL)
+ if (post_clear)
wb_clear_pending(wb, work);
}
break;
}
- wait_jiffies = msecs_to_jiffies(dirty_writeback_interval * 10);
- schedule_timeout_interruptible(wait_jiffies);
+ if (dirty_writeback_interval) {
+ wait_jiffies = msecs_to_jiffies(dirty_writeback_interval * 10);
+ schedule_timeout_interruptible(wait_jiffies);
+ } else {
+ set_current_state(TASK_INTERRUPTIBLE);
+ if (list_empty_careful(&wb->bdi->work_list) &&
+ !kthread_should_stop())
+ schedule();
+ __set_current_state(TASK_RUNNING);
+ }
+
try_to_freeze();
}
if (!bdi_has_dirty_io(bdi))
continue;
- bdi_alloc_queue_work(bdi, &args);
+ bdi_alloc_queue_work(bdi, &args, 0);
}
rcu_read_unlock();
iput(old_inode);
}
+static void __writeback_inodes_sb(struct super_block *sb, int sb_locked)
+{
+ unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY);
+ unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS);
+ long nr_to_write;
+
+ nr_to_write = nr_dirty + nr_unstable +
+ (inodes_stat.nr_inodes - inodes_stat.nr_unused);
+
+ bdi_start_writeback(sb->s_bdi, sb, nr_to_write, sb_locked);
+}
+
/**
* writeback_inodes_sb - writeback dirty inodes from given super_block
* @sb: the superblock
*/
void writeback_inodes_sb(struct super_block *sb)
{
- unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY);
- unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS);
- long nr_to_write;
-
- nr_to_write = nr_dirty + nr_unstable +
- (inodes_stat.nr_inodes - inodes_stat.nr_unused);
-
- bdi_start_writeback(sb->s_bdi, sb, nr_to_write);
+ __writeback_inodes_sb(sb, 0);
}
EXPORT_SYMBOL(writeback_inodes_sb);
+/**
+ * writeback_inodes_sb_locked - writeback dirty inodes from given super_block
+ * @sb: the superblock
+ *
+ * Like writeback_inodes_sb(), except the caller already holds the
+ * sb umount sem.
+ */
+void writeback_inodes_sb_locked(struct super_block *sb)
+{
+ __writeback_inodes_sb(sb, 1);
+}
+
/**
* writeback_inodes_sb_if_idle - start writeback if none underway
* @sb: the superblock
if ((start + nr_sects) != blk) {
rv = blkdev_issue_discard(bdev, start,
nr_sects, GFP_NOFS,
- DISCARD_FL_BARRIER);
+ BLKDEV_IFL_WAIT |
+ BLKDEV_IFL_BARRIER);
if (rv)
goto fail;
nr_sects = 0;
}
if (nr_sects) {
rv = blkdev_issue_discard(bdev, start, nr_sects, GFP_NOFS,
- DISCARD_FL_BARRIER);
+ BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
if (rv)
goto fail;
}
*/
if ((journal->j_fs_dev != journal->j_dev) &&
(journal->j_flags & JBD2_BARRIER))
- blkdev_issue_flush(journal->j_fs_dev, NULL);
+ blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL,
+ BLKDEV_IFL_WAIT);
if (!(journal->j_flags & JBD2_ABORT))
jbd2_journal_update_superblock(journal, 1);
return 0;
if (commit_transaction->t_flushed_data_blocks &&
(journal->j_fs_dev != journal->j_dev) &&
(journal->j_flags & JBD2_BARRIER))
- blkdev_issue_flush(journal->j_fs_dev, NULL);
+ blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL,
+ BLKDEV_IFL_WAIT);
/* Done it all: now write the commit record asynchronously. */
if (JBD2_HAS_INCOMPAT_FEATURE(journal,
if (err)
__jbd2_journal_abort_hard(journal);
if (journal->j_flags & JBD2_BARRIER)
- blkdev_issue_flush(journal->j_dev, NULL);
+ blkdev_issue_flush(journal->j_dev, GFP_KERNEL, NULL,
+ BLKDEV_IFL_WAIT);
}
err = journal_finish_inode_data_buffers(journal, commit_transaction);
start * sects_per_block,
nblocks * sects_per_block,
GFP_NOFS,
- DISCARD_FL_BARRIER);
+ BLKDEV_IFL_BARRIER);
if (ret < 0)
return ret;
nblocks = 0;
ret = blkdev_issue_discard(nilfs->ns_bdev,
start * sects_per_block,
nblocks * sects_per_block,
- GFP_NOFS, DISCARD_FL_BARRIER);
+ GFP_NOFS, BLKDEV_IFL_BARRIER);
return ret;
}
#if defined(CONFIG_ACORN_PARTITION_CUMANA) || \
defined(CONFIG_ACORN_PARTITION_ADFS)
-static int
-riscix_partition(struct parsed_partitions *state, struct block_device *bdev,
- unsigned long first_sect, int slot, unsigned long nr_sects)
+static int riscix_partition(struct parsed_partitions *state,
+ unsigned long first_sect, int slot,
+ unsigned long nr_sects)
{
Sector sect;
struct riscix_record *rr;
- rr = (struct riscix_record *)read_dev_sector(bdev, first_sect, §);
+ rr = read_part_sector(state, first_sect, §);
if (!rr)
return -1;
#if defined(CONFIG_ACORN_PARTITION_CUMANA) || \
defined(CONFIG_ACORN_PARTITION_ADFS)
-static int
-linux_partition(struct parsed_partitions *state, struct block_device *bdev,
- unsigned long first_sect, int slot, unsigned long nr_sects)
+static int linux_partition(struct parsed_partitions *state,
+ unsigned long first_sect, int slot,
+ unsigned long nr_sects)
{
Sector sect;
struct linux_part *linuxp;
put_partition(state, slot++, first_sect, size);
- linuxp = (struct linux_part *)read_dev_sector(bdev, first_sect, §);
+ linuxp = read_part_sector(state, first_sect, §);
if (!linuxp)
return -1;
#endif
#ifdef CONFIG_ACORN_PARTITION_CUMANA
-int
-adfspart_check_CUMANA(struct parsed_partitions *state, struct block_device *bdev)
+int adfspart_check_CUMANA(struct parsed_partitions *state)
{
unsigned long first_sector = 0;
unsigned int start_blk = 0;
struct adfs_discrecord *dr;
unsigned int nr_sects;
- data = read_dev_sector(bdev, start_blk * 2 + 6, §);
+ data = read_part_sector(state, start_blk * 2 + 6, §);
if (!data)
return -1;
#ifdef CONFIG_ACORN_PARTITION_RISCIX
case PARTITION_RISCIX_SCSI:
/* RISCiX - we don't know how to find the next one. */
- slot = riscix_partition(state, bdev, first_sector,
- slot, nr_sects);
+ slot = riscix_partition(state, first_sector, slot,
+ nr_sects);
break;
#endif
case PARTITION_LINUX:
- slot = linux_partition(state, bdev, first_sector,
- slot, nr_sects);
+ slot = linux_partition(state, first_sector, slot,
+ nr_sects);
break;
}
put_dev_sector(sect);
* hda1 = ADFS partition on first drive.
* hda2 = non-ADFS partition.
*/
-int
-adfspart_check_ADFS(struct parsed_partitions *state, struct block_device *bdev)
+int adfspart_check_ADFS(struct parsed_partitions *state)
{
unsigned long start_sect, nr_sects, sectscyl, heads;
Sector sect;
unsigned char id;
int slot = 1;
- data = read_dev_sector(bdev, 6, §);
+ data = read_part_sector(state, 6, §);
if (!data)
return -1;
/*
* Work out start of non-adfs partition.
*/
- nr_sects = (bdev->bd_inode->i_size >> 9) - start_sect;
+ nr_sects = (state->bdev->bd_inode->i_size >> 9) - start_sect;
if (start_sect) {
switch (id) {
#ifdef CONFIG_ACORN_PARTITION_RISCIX
case PARTITION_RISCIX_SCSI:
case PARTITION_RISCIX_MFM:
- slot = riscix_partition(state, bdev, start_sect,
- slot, nr_sects);
+ slot = riscix_partition(state, start_sect, slot,
+ nr_sects);
break;
#endif
case PARTITION_LINUX:
- slot = linux_partition(state, bdev, start_sect,
- slot, nr_sects);
+ slot = linux_partition(state, start_sect, slot,
+ nr_sects);
break;
}
}
__le32 size;
};
-static int adfspart_check_ICSLinux(struct block_device *bdev, unsigned long block)
+static int adfspart_check_ICSLinux(struct parsed_partitions *state,
+ unsigned long block)
{
Sector sect;
- unsigned char *data = read_dev_sector(bdev, block, §);
+ unsigned char *data = read_part_sector(state, block, §);
int result = 0;
if (data) {
* hda2 = ADFS partition 1 on first drive.
* ..etc..
*/
-int
-adfspart_check_ICS(struct parsed_partitions *state, struct block_device *bdev)
+int adfspart_check_ICS(struct parsed_partitions *state)
{
const unsigned char *data;
const struct ics_part *p;
/*
* Try ICS style partitions - sector 0 contains partition info.
*/
- data = read_dev_sector(bdev, 0, §);
+ data = read_part_sector(state, 0, §);
if (!data)
return -1;
* partition is. We must not make this visible
* to the filesystem.
*/
- if (size > 1 && adfspart_check_ICSLinux(bdev, start)) {
+ if (size > 1 && adfspart_check_ICSLinux(state, start)) {
start += 1;
size -= 1;
}
* hda2 = ADFS partition 1 on first drive.
* ..etc..
*/
-int
-adfspart_check_POWERTEC(struct parsed_partitions *state, struct block_device *bdev)
+int adfspart_check_POWERTEC(struct parsed_partitions *state)
{
Sector sect;
const unsigned char *data;
int slot = 1;
int i;
- data = read_dev_sector(bdev, 0, §);
+ data = read_part_sector(state, 0, §);
if (!data)
return -1;
* 1. The individual ADFS boot block entries that are placed on the disk.
* 2. The start address of the next entry.
*/
-int
-adfspart_check_EESOX(struct parsed_partitions *state, struct block_device *bdev)
+int adfspart_check_EESOX(struct parsed_partitions *state)
{
Sector sect;
const unsigned char *data;
sector_t start = 0;
int i, slot = 1;
- data = read_dev_sector(bdev, 7, §);
+ data = read_part_sector(state, 7, §);
if (!data)
return -1;
if (i != 0) {
sector_t size;
- size = get_capacity(bdev->bd_disk);
+ size = get_capacity(state->bdev->bd_disk);
put_partition(state, slot++, start, size - start);
printk("\n");
}
* format, and everyone stick to it?
*/
-int adfspart_check_CUMANA(struct parsed_partitions *state, struct block_device *bdev);
-int adfspart_check_ADFS(struct parsed_partitions *state, struct block_device *bdev);
-int adfspart_check_ICS(struct parsed_partitions *state, struct block_device *bdev);
-int adfspart_check_POWERTEC(struct parsed_partitions *state, struct block_device *bdev);
-int adfspart_check_EESOX(struct parsed_partitions *state, struct block_device *bdev);
+int adfspart_check_CUMANA(struct parsed_partitions *state);
+int adfspart_check_ADFS(struct parsed_partitions *state);
+int adfspart_check_ICS(struct parsed_partitions *state);
+int adfspart_check_POWERTEC(struct parsed_partitions *state);
+int adfspart_check_EESOX(struct parsed_partitions *state);
return sum;
}
-int
-amiga_partition(struct parsed_partitions *state, struct block_device *bdev)
+int amiga_partition(struct parsed_partitions *state)
{
Sector sect;
unsigned char *data;
for (blk = 0; ; blk++, put_dev_sector(sect)) {
if (blk == RDB_ALLOCATION_LIMIT)
goto rdb_done;
- data = read_dev_sector(bdev, blk, §);
+ data = read_part_sector(state, blk, §);
if (!data) {
if (warn_no_part)
printk("Dev %s: unable to read RDB block %d\n",
- bdevname(bdev, b), blk);
+ bdevname(state->bdev, b), blk);
res = -1;
goto rdb_done;
}
}
printk("Dev %s: RDB in block %d has bad checksum\n",
- bdevname(bdev, b), blk);
+ bdevname(state->bdev, b), blk);
}
/* blksize is blocks per 512 byte standard block */
put_dev_sector(sect);
for (part = 1; blk>0 && part<=16; part++, put_dev_sector(sect)) {
blk *= blksize; /* Read in terms partition table understands */
- data = read_dev_sector(bdev, blk, §);
+ data = read_part_sector(state, blk, §);
if (!data) {
if (warn_no_part)
printk("Dev %s: unable to read partition block %d\n",
- bdevname(bdev, b), blk);
+ bdevname(state->bdev, b), blk);
res = -1;
goto rdb_done;
}
* fs/partitions/amiga.h
*/
-int amiga_partition(struct parsed_partitions *state, struct block_device *bdev);
+int amiga_partition(struct parsed_partitions *state);
memcmp (s, "RAW", 3) == 0 ;
}
-int atari_partition(struct parsed_partitions *state, struct block_device *bdev)
+int atari_partition(struct parsed_partitions *state)
{
Sector sect;
struct rootsector *rs;
int part_fmt = 0; /* 0:unknown, 1:AHDI, 2:ICD/Supra */
#endif
- rs = (struct rootsector *) read_dev_sector(bdev, 0, §);
+ rs = read_part_sector(state, 0, §);
if (!rs)
return -1;
/* Verify this is an Atari rootsector: */
- hd_size = bdev->bd_inode->i_size >> 9;
+ hd_size = state->bdev->bd_inode->i_size >> 9;
if (!VALID_PARTITION(&rs->part[0], hd_size) &&
!VALID_PARTITION(&rs->part[1], hd_size) &&
!VALID_PARTITION(&rs->part[2], hd_size) &&
printk(" XGM<");
partsect = extensect = be32_to_cpu(pi->st);
while (1) {
- xrs = (struct rootsector *)read_dev_sector(bdev, partsect, §2);
+ xrs = read_part_sector(state, partsect, §2);
if (!xrs) {
printk (" block %ld read failed\n", partsect);
put_dev_sector(sect);
u16 checksum; /* checksum for bootable disks */
} __attribute__((__packed__));
-int atari_partition(struct parsed_partitions *state, struct block_device *bdev);
+int atari_partition(struct parsed_partitions *state);
int warn_no_part = 1; /*This is ugly: should make genhd removable media aware*/
-static int (*check_part[])(struct parsed_partitions *, struct block_device *) = {
+static int (*check_part[])(struct parsed_partitions *) = {
/*
* Probe partition formats with tables at disk address 0
* that also have an ADFS boot block at 0xdc0.
struct parsed_partitions *state;
int i, res, err;
- state = kmalloc(sizeof(struct parsed_partitions), GFP_KERNEL);
+ state = kzalloc(sizeof(struct parsed_partitions), GFP_KERNEL);
if (!state)
return NULL;
+ state->bdev = bdev;
disk_name(hd, 0, state->name);
printk(KERN_INFO " %s:", state->name);
if (isdigit(state->name[strlen(state->name)-1]))
i = res = err = 0;
while (!res && check_part[i]) {
memset(&state->parts, 0, sizeof(state->parts));
- res = check_part[i++](state, bdev);
+ res = check_part[i++](state);
if (res < 0) {
/* We have hit an I/O error which we don't report now.
* But record it, and let the others do their job.
}
if (res > 0)
return state;
+ if (state->access_beyond_eod)
+ err = -ENOSPC;
if (err)
/* The partition is unrecognized. So report I/O errors if there were any */
res = err;
disk_part_iter_exit(&piter);
}
+static bool disk_unlock_native_capacity(struct gendisk *disk)
+{
+ const struct block_device_operations *bdops = disk->fops;
+
+ if (bdops->unlock_native_capacity &&
+ !(disk->flags & GENHD_FL_NATIVE_CAPACITY)) {
+ printk(KERN_CONT "enabling native capacity\n");
+ bdops->unlock_native_capacity(disk);
+ disk->flags |= GENHD_FL_NATIVE_CAPACITY;
+ return true;
+ } else {
+ printk(KERN_CONT "truncated\n");
+ return false;
+ }
+}
+
int rescan_partitions(struct gendisk *disk, struct block_device *bdev)
{
+ struct parsed_partitions *state = NULL;
struct disk_part_iter piter;
struct hd_struct *part;
- struct parsed_partitions *state;
int p, highest, res;
+rescan:
+ if (state && !IS_ERR(state)) {
+ kfree(state);
+ state = NULL;
+ }
if (bdev->bd_part_count)
return -EBUSY;
bdev->bd_invalidated = 0;
if (!get_capacity(disk) || !(state = check_partition(disk, bdev)))
return 0;
- if (IS_ERR(state)) /* I/O error reading the partition table */
+ if (IS_ERR(state)) {
+ /*
+ * I/O error reading the partition table. If any
+ * partition code tried to read beyond EOD, retry
+ * after unlocking native capacity.
+ */
+ if (PTR_ERR(state) == -ENOSPC) {
+ printk(KERN_WARNING "%s: partition table beyond EOD, ",
+ disk->disk_name);
+ if (disk_unlock_native_capacity(disk))
+ goto rescan;
+ }
return -EIO;
+ }
+ /*
+ * If any partition code tried to read beyond EOD, try
+ * unlocking native capacity even if partition table is
+ * sucessfully read as we could be missing some partitions.
+ */
+ if (state->access_beyond_eod) {
+ printk(KERN_WARNING
+ "%s: partition table partially beyond EOD, ",
+ disk->disk_name);
+ if (disk_unlock_native_capacity(disk))
+ goto rescan;
+ }
/* tell userspace that the media / partition table may have changed */
kobject_uevent(&disk_to_dev(disk)->kobj, KOBJ_CHANGE);
/* add partitions */
for (p = 1; p < state->limit; p++) {
sector_t size, from;
-try_scan:
+
size = state->parts[p].size;
if (!size)
continue;
from = state->parts[p].from;
if (from >= get_capacity(disk)) {
printk(KERN_WARNING
- "%s: p%d ignored, start %llu is behind the end of the disk\n",
+ "%s: p%d start %llu is beyond EOD, ",
disk->disk_name, p, (unsigned long long) from);
+ if (disk_unlock_native_capacity(disk))
+ goto rescan;
continue;
}
if (from + size > get_capacity(disk)) {
- const struct block_device_operations *bdops = disk->fops;
- unsigned long long capacity;
-
printk(KERN_WARNING
- "%s: p%d size %llu exceeds device capacity, ",
+ "%s: p%d size %llu extends beyond EOD, ",
disk->disk_name, p, (unsigned long long) size);
- if (bdops->set_capacity &&
- (disk->flags & GENHD_FL_NATIVE_CAPACITY) == 0) {
- printk(KERN_CONT "enabling native capacity\n");
- capacity = bdops->set_capacity(disk, ~0ULL);
- disk->flags |= GENHD_FL_NATIVE_CAPACITY;
- if (capacity > get_capacity(disk)) {
- set_capacity(disk, capacity);
- check_disk_size_change(disk, bdev);
- bdev->bd_invalidated = 0;
- }
- goto try_scan;
+ if (disk_unlock_native_capacity(disk)) {
+ /* free state and restart */
+ goto rescan;
} else {
/*
* we can not ignore partitions of broken tables
* we limit them to the end of the disk to avoid
* creating invalid block devices
*/
- printk(KERN_CONT "limited to end of disk\n");
size = get_capacity(disk) - from;
}
}
* description.
*/
struct parsed_partitions {
+ struct block_device *bdev;
char name[BDEVNAME_SIZE];
struct {
sector_t from;
} parts[DISK_MAX_PARTS];
int next;
int limit;
+ bool access_beyond_eod;
};
+static inline void *read_part_sector(struct parsed_partitions *state,
+ sector_t n, Sector *p)
+{
+ if (n >= get_capacity(state->bdev->bd_disk)) {
+ state->access_beyond_eod = true;
+ return NULL;
+ }
+ return read_dev_sector(state->bdev, n, p);
+}
+
static inline void
put_partition(struct parsed_partitions *p, int n, sector_t from, sector_t size)
{
* the part[0] entry for this disk, and is the number of
* physical sectors available on the disk.
*/
-static u64
-last_lba(struct block_device *bdev)
+static u64 last_lba(struct block_device *bdev)
{
if (!bdev || !bdev->bd_inode)
return 0;
/**
* read_lba(): Read bytes from disk, starting at given LBA
- * @bdev
+ * @state
* @lba
* @buffer
* @size_t
*
- * Description: Reads @count bytes from @bdev into @buffer.
+ * Description: Reads @count bytes from @state->bdev into @buffer.
* Returns number of bytes read on success, 0 on error.
*/
-static size_t
-read_lba(struct block_device *bdev, u64 lba, u8 * buffer, size_t count)
+static size_t read_lba(struct parsed_partitions *state,
+ u64 lba, u8 *buffer, size_t count)
{
size_t totalreadcount = 0;
+ struct block_device *bdev = state->bdev;
sector_t n = lba * (bdev_logical_block_size(bdev) / 512);
- if (!bdev || !buffer || lba > last_lba(bdev))
+ if (!buffer || lba > last_lba(bdev))
return 0;
while (count) {
int copied = 512;
Sector sect;
- unsigned char *data = read_dev_sector(bdev, n++, §);
+ unsigned char *data = read_part_sector(state, n++, §);
if (!data)
break;
if (copied > count)
/**
* alloc_read_gpt_entries(): reads partition entries from disk
- * @bdev
+ * @state
* @gpt - GPT header
*
* Description: Returns ptes on success, NULL on error.
* Allocates space for PTEs based on information found in @gpt.
* Notes: remember to free pte when you're done!
*/
-static gpt_entry *
-alloc_read_gpt_entries(struct block_device *bdev, gpt_header *gpt)
+static gpt_entry *alloc_read_gpt_entries(struct parsed_partitions *state,
+ gpt_header *gpt)
{
size_t count;
gpt_entry *pte;
- if (!bdev || !gpt)
+
+ if (!gpt)
return NULL;
count = le32_to_cpu(gpt->num_partition_entries) *
if (!pte)
return NULL;
- if (read_lba(bdev, le64_to_cpu(gpt->partition_entry_lba),
+ if (read_lba(state, le64_to_cpu(gpt->partition_entry_lba),
(u8 *) pte,
count) < count) {
kfree(pte);
/**
* alloc_read_gpt_header(): Allocates GPT header, reads into it from disk
- * @bdev
+ * @state
* @lba is the Logical Block Address of the partition table
*
* Description: returns GPT header on success, NULL on error. Allocates
- * and fills a GPT header starting at @ from @bdev.
+ * and fills a GPT header starting at @ from @state->bdev.
* Note: remember to free gpt when finished with it.
*/
-static gpt_header *
-alloc_read_gpt_header(struct block_device *bdev, u64 lba)
+static gpt_header *alloc_read_gpt_header(struct parsed_partitions *state,
+ u64 lba)
{
gpt_header *gpt;
- unsigned ssz = bdev_logical_block_size(bdev);
-
- if (!bdev)
- return NULL;
+ unsigned ssz = bdev_logical_block_size(state->bdev);
gpt = kzalloc(ssz, GFP_KERNEL);
if (!gpt)
return NULL;
- if (read_lba(bdev, lba, (u8 *) gpt, ssz) < ssz) {
+ if (read_lba(state, lba, (u8 *) gpt, ssz) < ssz) {
kfree(gpt);
gpt=NULL;
return NULL;
/**
* is_gpt_valid() - tests one GPT header and PTEs for validity
- * @bdev
+ * @state
* @lba is the logical block address of the GPT header to test
* @gpt is a GPT header ptr, filled on return.
* @ptes is a PTEs ptr, filled on return.
* Description: returns 1 if valid, 0 on error.
* If valid, returns pointers to newly allocated GPT header and PTEs.
*/
-static int
-is_gpt_valid(struct block_device *bdev, u64 lba,
- gpt_header **gpt, gpt_entry **ptes)
+static int is_gpt_valid(struct parsed_partitions *state, u64 lba,
+ gpt_header **gpt, gpt_entry **ptes)
{
u32 crc, origcrc;
u64 lastlba;
- if (!bdev || !gpt || !ptes)
+ if (!ptes)
return 0;
- if (!(*gpt = alloc_read_gpt_header(bdev, lba)))
+ if (!(*gpt = alloc_read_gpt_header(state, lba)))
return 0;
/* Check the GUID Partition Table signature */
/* Check the first_usable_lba and last_usable_lba are
* within the disk.
*/
- lastlba = last_lba(bdev);
+ lastlba = last_lba(state->bdev);
if (le64_to_cpu((*gpt)->first_usable_lba) > lastlba) {
pr_debug("GPT: first_usable_lba incorrect: %lld > %lld\n",
(unsigned long long)le64_to_cpu((*gpt)->first_usable_lba),
goto fail;
}
- if (!(*ptes = alloc_read_gpt_entries(bdev, *gpt)))
+ if (!(*ptes = alloc_read_gpt_entries(state, *gpt)))
goto fail;
/* Check the GUID Partition Entry Array CRC */
/**
* find_valid_gpt() - Search disk for valid GPT headers and PTEs
- * @bdev
+ * @state
* @gpt is a GPT header ptr, filled on return.
* @ptes is a PTEs ptr, filled on return.
* Description: Returns 1 if valid, 0 on error.
* This protects against devices which misreport their size, and forces
* the user to decide to use the Alternate GPT.
*/
-static int
-find_valid_gpt(struct block_device *bdev, gpt_header **gpt, gpt_entry **ptes)
+static int find_valid_gpt(struct parsed_partitions *state, gpt_header **gpt,
+ gpt_entry **ptes)
{
int good_pgpt = 0, good_agpt = 0, good_pmbr = 0;
gpt_header *pgpt = NULL, *agpt = NULL;
gpt_entry *pptes = NULL, *aptes = NULL;
legacy_mbr *legacymbr;
u64 lastlba;
- if (!bdev || !gpt || !ptes)
+
+ if (!ptes)
return 0;
- lastlba = last_lba(bdev);
+ lastlba = last_lba(state->bdev);
if (!force_gpt) {
/* This will be added to the EFI Spec. per Intel after v1.02. */
legacymbr = kzalloc(sizeof (*legacymbr), GFP_KERNEL);
if (legacymbr) {
- read_lba(bdev, 0, (u8 *) legacymbr,
- sizeof (*legacymbr));
+ read_lba(state, 0, (u8 *) legacymbr,
+ sizeof (*legacymbr));
good_pmbr = is_pmbr_valid(legacymbr);
kfree(legacymbr);
}
goto fail;
}
- good_pgpt = is_gpt_valid(bdev, GPT_PRIMARY_PARTITION_TABLE_LBA,
+ good_pgpt = is_gpt_valid(state, GPT_PRIMARY_PARTITION_TABLE_LBA,
&pgpt, &pptes);
if (good_pgpt)
- good_agpt = is_gpt_valid(bdev,
+ good_agpt = is_gpt_valid(state,
le64_to_cpu(pgpt->alternate_lba),
&agpt, &aptes);
if (!good_agpt && force_gpt)
- good_agpt = is_gpt_valid(bdev, lastlba,
- &agpt, &aptes);
+ good_agpt = is_gpt_valid(state, lastlba, &agpt, &aptes);
/* The obviously unsuccessful case */
if (!good_pgpt && !good_agpt)
}
/**
- * efi_partition(struct parsed_partitions *state, struct block_device *bdev)
+ * efi_partition(struct parsed_partitions *state)
* @state
- * @bdev
*
* Description: called from check.c, if the disk contains GPT
* partitions, sets up partition entries in the kernel.
* 1 if successful
*
*/
-int
-efi_partition(struct parsed_partitions *state, struct block_device *bdev)
+int efi_partition(struct parsed_partitions *state)
{
gpt_header *gpt = NULL;
gpt_entry *ptes = NULL;
u32 i;
- unsigned ssz = bdev_logical_block_size(bdev) / 512;
+ unsigned ssz = bdev_logical_block_size(state->bdev) / 512;
- if (!find_valid_gpt(bdev, &gpt, &ptes) || !gpt || !ptes) {
+ if (!find_valid_gpt(state, &gpt, &ptes) || !gpt || !ptes) {
kfree(gpt);
kfree(ptes);
return 0;
u64 size = le64_to_cpu(ptes[i].ending_lba) -
le64_to_cpu(ptes[i].starting_lba) + 1ULL;
- if (!is_pte_valid(&ptes[i], last_lba(bdev)))
+ if (!is_pte_valid(&ptes[i], last_lba(state->bdev)))
continue;
put_partition(state, i+1, start * ssz, size * ssz);
} __attribute__ ((packed)) legacy_mbr;
/* Functions */
-extern int efi_partition(struct parsed_partitions *state, struct block_device *bdev);
+extern int efi_partition(struct parsed_partitions *state);
#endif
/*
*/
-int
-ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
+int ibm_partition(struct parsed_partitions *state)
{
+ struct block_device *bdev = state->bdev;
int blocksize, res;
loff_t i_size, offset, size, fmt_size;
dasd_information2_t *info;
/*
* Get volume label, extract name and type.
*/
- data = read_dev_sector(bdev, info->label_block*(blocksize/512), §);
+ data = read_part_sector(state, info->label_block*(blocksize/512),
+ §);
if (data == NULL)
goto out_readerr;
*/
blk = cchhb2blk(&label->vol.vtoc, geo) + 1;
counter = 0;
- data = read_dev_sector(bdev, blk * (blocksize/512),
- §);
+ data = read_part_sector(state, blk * (blocksize/512),
+ §);
while (data != NULL) {
struct vtoc_format1_label f1;
|| f1.DS1FMTID == _ascebc['7']
|| f1.DS1FMTID == _ascebc['9']) {
blk++;
- data = read_dev_sector(bdev, blk *
- (blocksize/512),
- §);
+ data = read_part_sector(state,
+ blk * (blocksize/512), §);
continue;
}
size * (blocksize >> 9));
counter++;
blk++;
- data = read_dev_sector(bdev,
- blk * (blocksize/512),
- §);
+ data = read_part_sector(state,
+ blk * (blocksize/512), §);
}
if (!data)
-int ibm_partition(struct parsed_partitions *, struct block_device *);
+int ibm_partition(struct parsed_partitions *);
#include "check.h"
#include "karma.h"
-int karma_partition(struct parsed_partitions *state, struct block_device *bdev)
+int karma_partition(struct parsed_partitions *state)
{
int i;
int slot = 1;
} __attribute__((packed)) *label;
struct d_partition *p;
- data = read_dev_sector(bdev, 0, §);
+ data = read_part_sector(state, 0, §);
if (!data)
return -1;
#define KARMA_LABEL_MAGIC 0xAB56
-int karma_partition(struct parsed_partitions *state, struct block_device *bdev);
+int karma_partition(struct parsed_partitions *state);
/**
* ldm_validate_privheads - Compare the primary privhead with its backups
- * @bdev: Device holding the LDM Database
+ * @state: Partition check state including device holding the LDM Database
* @ph1: Memory struct to fill with ph contents
*
* Read and compare all three privheads from disk.
* Return: 'true' Success
* 'false' Error
*/
-static bool ldm_validate_privheads (struct block_device *bdev,
- struct privhead *ph1)
+static bool ldm_validate_privheads(struct parsed_partitions *state,
+ struct privhead *ph1)
{
static const int off[3] = { OFF_PRIV1, OFF_PRIV2, OFF_PRIV3 };
struct privhead *ph[3] = { ph1 };
long num_sects;
int i;
- BUG_ON (!bdev || !ph1);
+ BUG_ON (!state || !ph1);
ph[1] = kmalloc (sizeof (*ph[1]), GFP_KERNEL);
ph[2] = kmalloc (sizeof (*ph[2]), GFP_KERNEL);
/* Read and parse privheads */
for (i = 0; i < 3; i++) {
- data = read_dev_sector (bdev,
- ph[0]->config_start + off[i], §);
+ data = read_part_sector(state, ph[0]->config_start + off[i],
+ §);
if (!data) {
ldm_crit ("Disk read failed.");
goto out;
}
}
- num_sects = bdev->bd_inode->i_size >> 9;
+ num_sects = state->bdev->bd_inode->i_size >> 9;
if ((ph[0]->config_start > num_sects) ||
((ph[0]->config_start + ph[0]->config_size) > num_sects)) {
/**
* ldm_validate_tocblocks - Validate the table of contents and its backups
- * @bdev: Device holding the LDM Database
- * @base: Offset, into @bdev, of the database
+ * @state: Partition check state including device holding the LDM Database
+ * @base: Offset, into @state->bdev, of the database
* @ldb: Cache of the database structures
*
* Find and compare the four tables of contents of the LDM Database stored on
- * @bdev and return the parsed information into @toc1.
+ * @state->bdev and return the parsed information into @toc1.
*
* The offsets and sizes of the configs are range-checked against a privhead.
*
* Return: 'true' @toc1 contains validated TOCBLOCK info
* 'false' @toc1 contents are undefined
*/
-static bool ldm_validate_tocblocks(struct block_device *bdev,
- unsigned long base, struct ldmdb *ldb)
+static bool ldm_validate_tocblocks(struct parsed_partitions *state,
+ unsigned long base, struct ldmdb *ldb)
{
static const int off[4] = { OFF_TOCB1, OFF_TOCB2, OFF_TOCB3, OFF_TOCB4};
struct tocblock *tb[4];
int i, nr_tbs;
bool result = false;
- BUG_ON(!bdev || !ldb);
+ BUG_ON(!state || !ldb);
ph = &ldb->ph;
tb[0] = &ldb->toc;
tb[1] = kmalloc(sizeof(*tb[1]) * 3, GFP_KERNEL);
* skip any that fail as long as we get at least one valid TOCBLOCK.
*/
for (nr_tbs = i = 0; i < 4; i++) {
- data = read_dev_sector(bdev, base + off[i], §);
+ data = read_part_sector(state, base + off[i], §);
if (!data) {
ldm_error("Disk read failed for TOCBLOCK %d.", i);
continue;
/**
* ldm_validate_vmdb - Read the VMDB and validate it
- * @bdev: Device holding the LDM Database
+ * @state: Partition check state including device holding the LDM Database
* @base: Offset, into @bdev, of the database
* @ldb: Cache of the database structures
*
* Return: 'true' @ldb contains validated VBDB info
* 'false' @ldb contents are undefined
*/
-static bool ldm_validate_vmdb (struct block_device *bdev, unsigned long base,
- struct ldmdb *ldb)
+static bool ldm_validate_vmdb(struct parsed_partitions *state,
+ unsigned long base, struct ldmdb *ldb)
{
Sector sect;
u8 *data;
struct vmdb *vm;
struct tocblock *toc;
- BUG_ON (!bdev || !ldb);
+ BUG_ON (!state || !ldb);
vm = &ldb->vm;
toc = &ldb->toc;
- data = read_dev_sector (bdev, base + OFF_VMDB, §);
+ data = read_part_sector(state, base + OFF_VMDB, §);
if (!data) {
ldm_crit ("Disk read failed.");
return false;
/**
* ldm_validate_partition_table - Determine whether bdev might be a dynamic disk
- * @bdev: Device holding the LDM Database
+ * @state: Partition check state including device holding the LDM Database
*
* This function provides a weak test to decide whether the device is a dynamic
* disk or not. It looks for an MS-DOS-style partition table containing at
* least one partition of type 0x42 (formerly SFS, now used by Windows for
* dynamic disks).
*
- * N.B. The only possible error can come from the read_dev_sector and that is
+ * N.B. The only possible error can come from the read_part_sector and that is
* only likely to happen if the underlying device is strange. If that IS
* the case we should return zero to let someone else try.
*
- * Return: 'true' @bdev is a dynamic disk
- * 'false' @bdev is not a dynamic disk, or an error occurred
+ * Return: 'true' @state->bdev is a dynamic disk
+ * 'false' @state->bdev is not a dynamic disk, or an error occurred
*/
-static bool ldm_validate_partition_table (struct block_device *bdev)
+static bool ldm_validate_partition_table(struct parsed_partitions *state)
{
Sector sect;
u8 *data;
int i;
bool result = false;
- BUG_ON (!bdev);
+ BUG_ON(!state);
- data = read_dev_sector (bdev, 0, §);
+ data = read_part_sector(state, 0, §);
if (!data) {
ldm_crit ("Disk read failed.");
return false;
/**
* ldm_get_vblks - Read the on-disk database of VBLKs into memory
- * @bdev: Device holding the LDM Database
- * @base: Offset, into @bdev, of the database
+ * @state: Partition check state including device holding the LDM Database
+ * @base: Offset, into @state->bdev, of the database
* @ldb: Cache of the database structures
*
* To use the information from the VBLKs, they need to be read from the disk,
* Return: 'true' All the VBLKs were read successfully
* 'false' An error occurred
*/
-static bool ldm_get_vblks (struct block_device *bdev, unsigned long base,
- struct ldmdb *ldb)
+static bool ldm_get_vblks(struct parsed_partitions *state, unsigned long base,
+ struct ldmdb *ldb)
{
int size, perbuf, skip, finish, s, v, recs;
u8 *data = NULL;
bool result = false;
LIST_HEAD (frags);
- BUG_ON (!bdev || !ldb);
+ BUG_ON(!state || !ldb);
size = ldb->vm.vblk_size;
perbuf = 512 / size;
finish = (size * ldb->vm.last_vblk_seq) >> 9;
for (s = skip; s < finish; s++) { /* For each sector */
- data = read_dev_sector (bdev, base + OFF_VMDB + s, §);
+ data = read_part_sector(state, base + OFF_VMDB + s, §);
if (!data) {
ldm_crit ("Disk read failed.");
goto out;
/**
* ldm_partition - Find out whether a device is a dynamic disk and handle it
- * @pp: List of the partitions parsed so far
- * @bdev: Device holding the LDM Database
+ * @state: Partition check state including device holding the LDM Database
*
* This determines whether the device @bdev is a dynamic disk and if so creates
* the partitions necessary in the gendisk structure pointed to by @hd.
* example, if the device is hda, we would have: hda1: LDM database, hda2, hda3,
* and so on: the actual data containing partitions.
*
- * Return: 1 Success, @bdev is a dynamic disk and we handled it
- * 0 Success, @bdev is not a dynamic disk
+ * Return: 1 Success, @state->bdev is a dynamic disk and we handled it
+ * 0 Success, @state->bdev is not a dynamic disk
* -1 An error occurred before enough information had been read
- * Or @bdev is a dynamic disk, but it may be corrupted
+ * Or @state->bdev is a dynamic disk, but it may be corrupted
*/
-int ldm_partition (struct parsed_partitions *pp, struct block_device *bdev)
+int ldm_partition(struct parsed_partitions *state)
{
struct ldmdb *ldb;
unsigned long base;
int result = -1;
- BUG_ON (!pp || !bdev);
+ BUG_ON(!state);
/* Look for signs of a Dynamic Disk */
- if (!ldm_validate_partition_table (bdev))
+ if (!ldm_validate_partition_table(state))
return 0;
ldb = kmalloc (sizeof (*ldb), GFP_KERNEL);
}
/* Parse and check privheads. */
- if (!ldm_validate_privheads (bdev, &ldb->ph))
+ if (!ldm_validate_privheads(state, &ldb->ph))
goto out; /* Already logged */
/* All further references are relative to base (database start). */
base = ldb->ph.config_start;
/* Parse and check tocs and vmdb. */
- if (!ldm_validate_tocblocks (bdev, base, ldb) ||
- !ldm_validate_vmdb (bdev, base, ldb))
+ if (!ldm_validate_tocblocks(state, base, ldb) ||
+ !ldm_validate_vmdb(state, base, ldb))
goto out; /* Already logged */
/* Initialize vblk lists in ldmdb struct */
INIT_LIST_HEAD (&ldb->v_comp);
INIT_LIST_HEAD (&ldb->v_part);
- if (!ldm_get_vblks (bdev, base, ldb)) {
+ if (!ldm_get_vblks(state, base, ldb)) {
ldm_crit ("Failed to read the VBLKs from the database.");
goto cleanup;
}
/* Finally, create the data partition devices. */
- if (ldm_create_data_partitions (pp, ldb)) {
+ if (ldm_create_data_partitions(state, ldb)) {
ldm_debug ("Parsed LDM database successfully.");
result = 1;
}
struct list_head v_part;
};
-int ldm_partition (struct parsed_partitions *state, struct block_device *bdev);
+int ldm_partition(struct parsed_partitions *state);
#endif /* _FS_PT_LDM_H_ */
stg[i] = 0;
}
-int mac_partition(struct parsed_partitions *state, struct block_device *bdev)
+int mac_partition(struct parsed_partitions *state)
{
int slot = 1;
Sector sect;
struct mac_driver_desc *md;
/* Get 0th block and look at the first partition map entry. */
- md = (struct mac_driver_desc *) read_dev_sector(bdev, 0, §);
+ md = read_part_sector(state, 0, §);
if (!md)
return -1;
if (be16_to_cpu(md->signature) != MAC_DRIVER_MAGIC) {
}
secsize = be16_to_cpu(md->block_size);
put_dev_sector(sect);
- data = read_dev_sector(bdev, secsize/512, §);
+ data = read_part_sector(state, secsize/512, §);
if (!data)
return -1;
part = (struct mac_partition *) (data + secsize%512);
for (blk = 1; blk <= blocks_in_map; ++blk) {
int pos = blk * secsize;
put_dev_sector(sect);
- data = read_dev_sector(bdev, pos/512, §);
+ data = read_part_sector(state, pos/512, §);
if (!data)
return -1;
part = (struct mac_partition *) (data + pos%512);
}
#ifdef CONFIG_PPC_PMAC
if (found_root_goodness)
- note_bootable_part(bdev->bd_dev, found_root, found_root_goodness);
+ note_bootable_part(state->bdev->bd_dev, found_root,
+ found_root_goodness);
#endif
put_dev_sector(sect);
/* ... more stuff */
};
-int mac_partition(struct parsed_partitions *state, struct block_device *bdev);
+int mac_partition(struct parsed_partitions *state);
#define AIX_LABEL_MAGIC2 0xC2
#define AIX_LABEL_MAGIC3 0xD4
#define AIX_LABEL_MAGIC4 0xC1
-static int aix_magic_present(unsigned char *p, struct block_device *bdev)
+static int aix_magic_present(struct parsed_partitions *state, unsigned char *p)
{
struct partition *pt = (struct partition *) (p + 0x1be);
Sector sect;
is_extended_partition(pt))
return 0;
}
- d = read_dev_sector(bdev, 7, §);
+ d = read_part_sector(state, 7, §);
if (d) {
if (d[0] == '_' && d[1] == 'L' && d[2] == 'V' && d[3] == 'M')
ret = 1;
* only for the actual data partitions.
*/
-static void
-parse_extended(struct parsed_partitions *state, struct block_device *bdev,
- sector_t first_sector, sector_t first_size)
+static void parse_extended(struct parsed_partitions *state,
+ sector_t first_sector, sector_t first_size)
{
struct partition *p;
Sector sect;
unsigned char *data;
sector_t this_sector, this_size;
- sector_t sector_size = bdev_logical_block_size(bdev) / 512;
+ sector_t sector_size = bdev_logical_block_size(state->bdev) / 512;
int loopct = 0; /* number of links followed
without finding a data partition */
int i;
return;
if (state->next == state->limit)
return;
- data = read_dev_sector(bdev, this_sector, §);
+ data = read_part_sector(state, this_sector, §);
if (!data)
return;
/* james@bpgc.com: Solaris has a nasty indicator: 0x82 which also
indicates linux swap. Be careful before believing this is Solaris. */
-static void
-parse_solaris_x86(struct parsed_partitions *state, struct block_device *bdev,
- sector_t offset, sector_t size, int origin)
+static void parse_solaris_x86(struct parsed_partitions *state,
+ sector_t offset, sector_t size, int origin)
{
#ifdef CONFIG_SOLARIS_X86_PARTITION
Sector sect;
int i;
short max_nparts;
- v = (struct solaris_x86_vtoc *)read_dev_sector(bdev, offset+1, §);
+ v = read_part_sector(state, offset + 1, §);
if (!v)
return;
if (le32_to_cpu(v->v_sanity) != SOLARIS_X86_VTOC_SANE) {
* Create devices for BSD partitions listed in a disklabel, under a
* dos-like partition. See parse_extended() for more information.
*/
-static void
-parse_bsd(struct parsed_partitions *state, struct block_device *bdev,
- sector_t offset, sector_t size, int origin, char *flavour,
- int max_partitions)
+static void parse_bsd(struct parsed_partitions *state,
+ sector_t offset, sector_t size, int origin, char *flavour,
+ int max_partitions)
{
Sector sect;
struct bsd_disklabel *l;
struct bsd_partition *p;
- l = (struct bsd_disklabel *)read_dev_sector(bdev, offset+1, §);
+ l = read_part_sector(state, offset + 1, §);
if (!l)
return;
if (le32_to_cpu(l->d_magic) != BSD_DISKMAGIC) {
}
#endif
-static void
-parse_freebsd(struct parsed_partitions *state, struct block_device *bdev,
- sector_t offset, sector_t size, int origin)
+static void parse_freebsd(struct parsed_partitions *state,
+ sector_t offset, sector_t size, int origin)
{
#ifdef CONFIG_BSD_DISKLABEL
- parse_bsd(state, bdev, offset, size, origin,
- "bsd", BSD_MAXPARTITIONS);
+ parse_bsd(state, offset, size, origin, "bsd", BSD_MAXPARTITIONS);
#endif
}
-static void
-parse_netbsd(struct parsed_partitions *state, struct block_device *bdev,
- sector_t offset, sector_t size, int origin)
+static void parse_netbsd(struct parsed_partitions *state,
+ sector_t offset, sector_t size, int origin)
{
#ifdef CONFIG_BSD_DISKLABEL
- parse_bsd(state, bdev, offset, size, origin,
- "netbsd", BSD_MAXPARTITIONS);
+ parse_bsd(state, offset, size, origin, "netbsd", BSD_MAXPARTITIONS);
#endif
}
-static void
-parse_openbsd(struct parsed_partitions *state, struct block_device *bdev,
- sector_t offset, sector_t size, int origin)
+static void parse_openbsd(struct parsed_partitions *state,
+ sector_t offset, sector_t size, int origin)
{
#ifdef CONFIG_BSD_DISKLABEL
- parse_bsd(state, bdev, offset, size, origin,
- "openbsd", OPENBSD_MAXPARTITIONS);
+ parse_bsd(state, offset, size, origin, "openbsd",
+ OPENBSD_MAXPARTITIONS);
#endif
}
* Create devices for Unixware partitions listed in a disklabel, under a
* dos-like partition. See parse_extended() for more information.
*/
-static void
-parse_unixware(struct parsed_partitions *state, struct block_device *bdev,
- sector_t offset, sector_t size, int origin)
+static void parse_unixware(struct parsed_partitions *state,
+ sector_t offset, sector_t size, int origin)
{
#ifdef CONFIG_UNIXWARE_DISKLABEL
Sector sect;
struct unixware_disklabel *l;
struct unixware_slice *p;
- l = (struct unixware_disklabel *)read_dev_sector(bdev, offset+29, §);
+ l = read_part_sector(state, offset + 29, §);
if (!l)
return;
if (le32_to_cpu(l->d_magic) != UNIXWARE_DISKMAGIC ||
* Anand Krishnamurthy <anandk@wiproge.med.ge.com>
* Rajeev V. Pillai <rajeevvp@yahoo.com>
*/
-static void
-parse_minix(struct parsed_partitions *state, struct block_device *bdev,
- sector_t offset, sector_t size, int origin)
+static void parse_minix(struct parsed_partitions *state,
+ sector_t offset, sector_t size, int origin)
{
#ifdef CONFIG_MINIX_SUBPARTITION
Sector sect;
struct partition *p;
int i;
- data = read_dev_sector(bdev, offset, §);
+ data = read_part_sector(state, offset, §);
if (!data)
return;
static struct {
unsigned char id;
- void (*parse)(struct parsed_partitions *, struct block_device *,
- sector_t, sector_t, int);
+ void (*parse)(struct parsed_partitions *, sector_t, sector_t, int);
} subtypes[] = {
{FREEBSD_PARTITION, parse_freebsd},
{NETBSD_PARTITION, parse_netbsd},
{0, NULL},
};
-int msdos_partition(struct parsed_partitions *state, struct block_device *bdev)
+int msdos_partition(struct parsed_partitions *state)
{
- sector_t sector_size = bdev_logical_block_size(bdev) / 512;
+ sector_t sector_size = bdev_logical_block_size(state->bdev) / 512;
Sector sect;
unsigned char *data;
struct partition *p;
struct fat_boot_sector *fb;
int slot;
- data = read_dev_sector(bdev, 0, §);
+ data = read_part_sector(state, 0, §);
if (!data)
return -1;
if (!msdos_magic_present(data + 510)) {
return 0;
}
- if (aix_magic_present(data, bdev)) {
+ if (aix_magic_present(state, data)) {
put_dev_sector(sect);
printk( " [AIX]");
return 0;
put_partition(state, slot, start, n);
printk(" <");
- parse_extended(state, bdev, start, size);
+ parse_extended(state, start, size);
printk(" >");
continue;
}
if (!subtypes[n].parse)
continue;
- subtypes[n].parse(state, bdev, start_sect(p)*sector_size,
- nr_sects(p)*sector_size, slot);
+ subtypes[n].parse(state, start_sect(p) * sector_size,
+ nr_sects(p) * sector_size, slot);
}
put_dev_sector(sect);
return 1;
#define MSDOS_LABEL_MAGIC 0xAA55
-int msdos_partition(struct parsed_partitions *state, struct block_device *bdev);
+int msdos_partition(struct parsed_partitions *state);
#include "check.h"
#include "osf.h"
-int osf_partition(struct parsed_partitions *state, struct block_device *bdev)
+int osf_partition(struct parsed_partitions *state)
{
int i;
int slot = 1;
} * label;
struct d_partition * partition;
- data = read_dev_sector(bdev, 0, §);
+ data = read_part_sector(state, 0, §);
if (!data)
return -1;
#define DISKLABELMAGIC (0x82564557UL)
-int osf_partition(struct parsed_partitions *state, struct block_device *bdev);
+int osf_partition(struct parsed_partitions *state);
__be32 _unused1; /* Padding */
};
-int sgi_partition(struct parsed_partitions *state, struct block_device *bdev)
+int sgi_partition(struct parsed_partitions *state)
{
int i, csum;
__be32 magic;
struct sgi_partition *p;
char b[BDEVNAME_SIZE];
- label = (struct sgi_disklabel *) read_dev_sector(bdev, 0, §);
+ label = read_part_sector(state, 0, §);
if (!label)
return -1;
p = &label->partitions[0];
}
if(csum) {
printk(KERN_WARNING "Dev %s SGI disklabel: csum bad, label corrupted\n",
- bdevname(bdev, b));
+ bdevname(state->bdev, b));
put_dev_sector(sect);
return 0;
}
* fs/partitions/sgi.h
*/
-extern int sgi_partition(struct parsed_partitions *state, struct block_device *bdev);
+extern int sgi_partition(struct parsed_partitions *state);
#define SGI_LABEL_MAGIC 0x0be5a941
#include "check.h"
#include "sun.h"
-int sun_partition(struct parsed_partitions *state, struct block_device *bdev)
+int sun_partition(struct parsed_partitions *state)
{
int i;
__be16 csum;
int use_vtoc;
int nparts;
- label = (struct sun_disklabel *)read_dev_sector(bdev, 0, §);
+ label = read_part_sector(state, 0, §);
if (!label)
return -1;
csum ^= *ush--;
if (csum) {
printk("Dev %s Sun disklabel: Csum bad, label corrupted\n",
- bdevname(bdev, b));
+ bdevname(state->bdev, b));
put_dev_sector(sect);
return 0;
}
#define SUN_LABEL_MAGIC 0xDABE
#define SUN_VTOC_SANITY 0x600DDEEE
-int sun_partition(struct parsed_partitions *state, struct block_device *bdev);
+int sun_partition(struct parsed_partitions *state);
};
-int sysv68_partition(struct parsed_partitions *state, struct block_device *bdev)
+int sysv68_partition(struct parsed_partitions *state)
{
int i, slices;
int slot = 1;
struct dkblk0 *b;
struct slice *slice;
- data = read_dev_sector(bdev, 0, §);
+ data = read_part_sector(state, 0, §);
if (!data)
return -1;
i = be32_to_cpu(b->dk_ios.ios_slcblk);
put_dev_sector(sect);
- data = read_dev_sector(bdev, i, §);
+ data = read_part_sector(state, i, §);
if (!data)
return -1;
-extern int sysv68_partition(struct parsed_partitions *state, struct block_device *bdev);
+extern int sysv68_partition(struct parsed_partitions *state);
#include "check.h"
#include "ultrix.h"
-int ultrix_partition(struct parsed_partitions *state, struct block_device *bdev)
+int ultrix_partition(struct parsed_partitions *state)
{
int i;
Sector sect;
#define PT_MAGIC 0x032957 /* Partition magic number */
#define PT_VALID 1 /* Indicates if struct is valid */
- data = read_dev_sector(bdev, (16384 - sizeof(*label))/512, §);
+ data = read_part_sector(state, (16384 - sizeof(*label))/512, §);
if (!data)
return -1;
* fs/partitions/ultrix.h
*/
-int ultrix_partition(struct parsed_partitions *state, struct block_device *bdev);
+int ultrix_partition(struct parsed_partitions *state);
#include <linux/module.h>
#include <linux/init.h>
#include <linux/fs.h>
+#include <linux/log2.h>
#include <linux/mount.h>
#include <linux/pipe_fs_i.h>
#include <linux/uio.h>
#include <linux/pagemap.h>
#include <linux/audit.h>
#include <linux/syscalls.h>
+#include <linux/fcntl.h>
#include <asm/uaccess.h>
#include <asm/ioctls.h>
+/*
+ * The max size that a non-root user is allowed to grow the pipe. Can
+ * be set by root in /proc/sys/fs/pipe-max-pages
+ */
+unsigned int pipe_max_pages = PIPE_DEF_BUFFERS * 16;
+
/*
* We use a start+len construction, which provides full use of the
* allocated memory.
if (!buf->len) {
buf->ops = NULL;
ops->release(pipe, buf);
- curbuf = (curbuf + 1) & (PIPE_BUFFERS-1);
+ curbuf = (curbuf + 1) & (pipe->buffers - 1);
pipe->curbuf = curbuf;
pipe->nrbufs = --bufs;
do_wakeup = 1;
chars = total_len & (PAGE_SIZE-1); /* size of the last buffer */
if (pipe->nrbufs && chars != 0) {
int lastbuf = (pipe->curbuf + pipe->nrbufs - 1) &
- (PIPE_BUFFERS-1);
+ (pipe->buffers - 1);
struct pipe_buffer *buf = pipe->bufs + lastbuf;
const struct pipe_buf_operations *ops = buf->ops;
int offset = buf->offset + buf->len;
break;
}
bufs = pipe->nrbufs;
- if (bufs < PIPE_BUFFERS) {
- int newbuf = (pipe->curbuf + bufs) & (PIPE_BUFFERS-1);
+ if (bufs < pipe->buffers) {
+ int newbuf = (pipe->curbuf + bufs) & (pipe->buffers-1);
struct pipe_buffer *buf = pipe->bufs + newbuf;
struct page *page = pipe->tmp_page;
char *src;
if (!total_len)
break;
}
- if (bufs < PIPE_BUFFERS)
+ if (bufs < pipe->buffers)
continue;
if (filp->f_flags & O_NONBLOCK) {
if (!ret)
nrbufs = pipe->nrbufs;
while (--nrbufs >= 0) {
count += pipe->bufs[buf].len;
- buf = (buf+1) & (PIPE_BUFFERS-1);
+ buf = (buf+1) & (pipe->buffers - 1);
}
mutex_unlock(&inode->i_mutex);
}
if (filp->f_mode & FMODE_WRITE) {
- mask |= (nrbufs < PIPE_BUFFERS) ? POLLOUT | POLLWRNORM : 0;
+ mask |= (nrbufs < pipe->buffers) ? POLLOUT | POLLWRNORM : 0;
/*
* Most Unices do not set POLLERR for FIFOs but on Linux they
* behave exactly like pipes for poll().
pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL);
if (pipe) {
- init_waitqueue_head(&pipe->wait);
- pipe->r_counter = pipe->w_counter = 1;
- pipe->inode = inode;
+ pipe->bufs = kzalloc(sizeof(struct pipe_buffer) * PIPE_DEF_BUFFERS, GFP_KERNEL);
+ if (pipe->bufs) {
+ init_waitqueue_head(&pipe->wait);
+ pipe->r_counter = pipe->w_counter = 1;
+ pipe->inode = inode;
+ pipe->buffers = PIPE_DEF_BUFFERS;
+ return pipe;
+ }
+ kfree(pipe);
}
- return pipe;
+ return NULL;
}
void __free_pipe_info(struct pipe_inode_info *pipe)
{
int i;
- for (i = 0; i < PIPE_BUFFERS; i++) {
+ for (i = 0; i < pipe->buffers; i++) {
struct pipe_buffer *buf = pipe->bufs + i;
if (buf->ops)
buf->ops->release(pipe, buf);
}
if (pipe->tmp_page)
__free_page(pipe->tmp_page);
+ kfree(pipe->bufs);
kfree(pipe);
}
return sys_pipe2(fildes, 0);
}
+/*
+ * Allocate a new array of pipe buffers and copy the info over. Returns the
+ * pipe size if successful, or return -ERROR on error.
+ */
+static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg)
+{
+ struct pipe_buffer *bufs;
+
+ /*
+ * Must be a power-of-2 currently
+ */
+ if (!is_power_of_2(arg))
+ return -EINVAL;
+
+ /*
+ * We can shrink the pipe, if arg >= pipe->nrbufs. Since we don't
+ * expect a lot of shrink+grow operations, just free and allocate
+ * again like we would do for growing. If the pipe currently
+ * contains more buffers than arg, then return busy.
+ */
+ if (arg < pipe->nrbufs)
+ return -EBUSY;
+
+ bufs = kcalloc(arg, sizeof(struct pipe_buffer), GFP_KERNEL);
+ if (unlikely(!bufs))
+ return -ENOMEM;
+
+ /*
+ * The pipe array wraps around, so just start the new one at zero
+ * and adjust the indexes.
+ */
+ if (pipe->nrbufs) {
+ const unsigned int tail = pipe->nrbufs & (pipe->buffers - 1);
+ const unsigned int head = pipe->nrbufs - tail;
+
+ if (head)
+ memcpy(bufs, pipe->bufs + pipe->curbuf, head * sizeof(struct pipe_buffer));
+ if (tail)
+ memcpy(bufs + head, pipe->bufs + pipe->curbuf, tail * sizeof(struct pipe_buffer));
+ }
+
+ pipe->curbuf = 0;
+ kfree(pipe->bufs);
+ pipe->bufs = bufs;
+ pipe->buffers = arg;
+ return arg;
+}
+
+long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+ struct pipe_inode_info *pipe;
+ long ret;
+
+ pipe = file->f_path.dentry->d_inode->i_pipe;
+ if (!pipe)
+ return -EBADF;
+
+ mutex_lock(&pipe->inode->i_mutex);
+
+ switch (cmd) {
+ case F_SETPIPE_SZ:
+ if (!capable(CAP_SYS_ADMIN) && arg > pipe_max_pages)
+ return -EINVAL;
+ /*
+ * The pipe needs to be at least 2 pages large to
+ * guarantee POSIX behaviour.
+ */
+ if (arg < 2)
+ return -EINVAL;
+ ret = pipe_set_size(pipe, arg);
+ break;
+ case F_GETPIPE_SZ:
+ ret = pipe->buffers;
+ break;
+ default:
+ ret = -EINVAL;
+ break;
+ }
+
+ mutex_unlock(&pipe->inode->i_mutex);
+ return ret;
+}
+
/*
* pipefs should _never_ be mounted by userland - too much of security hassle,
* no real gain from having the whole whorehouse mounted. So we don't need
barrier_done = reiserfs_commit_for_inode(inode);
reiserfs_write_unlock(inode->i_sb);
if (barrier_done != 1 && reiserfs_barrier_flush(inode->i_sb))
- blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
+ blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL,
+ BLKDEV_IFL_WAIT);
if (barrier_done < 0)
return barrier_done;
return (err < 0) ? -EIO : 0;
break;
}
- if (pipe->nrbufs < PIPE_BUFFERS) {
- int newbuf = (pipe->curbuf + pipe->nrbufs) & (PIPE_BUFFERS - 1);
+ if (pipe->nrbufs < pipe->buffers) {
+ int newbuf = (pipe->curbuf + pipe->nrbufs) & (pipe->buffers - 1);
struct pipe_buffer *buf = pipe->bufs + newbuf;
buf->page = spd->pages[page_nr];
if (!--spd->nr_pages)
break;
- if (pipe->nrbufs < PIPE_BUFFERS)
+ if (pipe->nrbufs < pipe->buffers)
continue;
break;
page_cache_release(spd->pages[i]);
}
+/*
+ * Check if we need to grow the arrays holding pages and partial page
+ * descriptions.
+ */
+int splice_grow_spd(struct pipe_inode_info *pipe, struct splice_pipe_desc *spd)
+{
+ if (pipe->buffers <= PIPE_DEF_BUFFERS)
+ return 0;
+
+ spd->pages = kmalloc(pipe->buffers * sizeof(struct page *), GFP_KERNEL);
+ spd->partial = kmalloc(pipe->buffers * sizeof(struct partial_page), GFP_KERNEL);
+
+ if (spd->pages && spd->partial)
+ return 0;
+
+ kfree(spd->pages);
+ kfree(spd->partial);
+ return -ENOMEM;
+}
+
+void splice_shrink_spd(struct pipe_inode_info *pipe,
+ struct splice_pipe_desc *spd)
+{
+ if (pipe->buffers <= PIPE_DEF_BUFFERS)
+ return;
+
+ kfree(spd->pages);
+ kfree(spd->partial);
+}
+
static int
__generic_file_splice_read(struct file *in, loff_t *ppos,
struct pipe_inode_info *pipe, size_t len,
{
struct address_space *mapping = in->f_mapping;
unsigned int loff, nr_pages, req_pages;
- struct page *pages[PIPE_BUFFERS];
- struct partial_page partial[PIPE_BUFFERS];
+ struct page *pages[PIPE_DEF_BUFFERS];
+ struct partial_page partial[PIPE_DEF_BUFFERS];
struct page *page;
pgoff_t index, end_index;
loff_t isize;
.spd_release = spd_release_page,
};
+ if (splice_grow_spd(pipe, &spd))
+ return -ENOMEM;
+
index = *ppos >> PAGE_CACHE_SHIFT;
loff = *ppos & ~PAGE_CACHE_MASK;
req_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
- nr_pages = min(req_pages, (unsigned)PIPE_BUFFERS);
+ nr_pages = min(req_pages, pipe->buffers);
/*
* Lookup the (hopefully) full range of pages we need.
*/
- spd.nr_pages = find_get_pages_contig(mapping, index, nr_pages, pages);
+ spd.nr_pages = find_get_pages_contig(mapping, index, nr_pages, spd.pages);
index += spd.nr_pages;
/*
unlock_page(page);
}
- pages[spd.nr_pages++] = page;
+ spd.pages[spd.nr_pages++] = page;
index++;
}
* this_len is the max we'll use from this page
*/
this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff);
- page = pages[page_nr];
+ page = spd.pages[page_nr];
if (PageReadahead(page))
page_cache_async_readahead(mapping, &in->f_ra, in,
error = -ENOMEM;
break;
}
- page_cache_release(pages[page_nr]);
- pages[page_nr] = page;
+ page_cache_release(spd.pages[page_nr]);
+ spd.pages[page_nr] = page;
}
/*
* page was already under io and is now done, great
len = this_len;
}
- partial[page_nr].offset = loff;
- partial[page_nr].len = this_len;
+ spd.partial[page_nr].offset = loff;
+ spd.partial[page_nr].len = this_len;
len -= this_len;
loff = 0;
spd.nr_pages++;
* we got, 'nr_pages' is how many pages are in the map.
*/
while (page_nr < nr_pages)
- page_cache_release(pages[page_nr++]);
+ page_cache_release(spd.pages[page_nr++]);
in->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT;
if (spd.nr_pages)
- return splice_to_pipe(pipe, &spd);
+ error = splice_to_pipe(pipe, &spd);
+ splice_shrink_spd(pipe, &spd);
return error;
}
unsigned int nr_pages;
unsigned int nr_freed;
size_t offset;
- struct page *pages[PIPE_BUFFERS];
- struct partial_page partial[PIPE_BUFFERS];
- struct iovec vec[PIPE_BUFFERS];
+ struct page *pages[PIPE_DEF_BUFFERS];
+ struct partial_page partial[PIPE_DEF_BUFFERS];
+ struct iovec *vec, __vec[PIPE_DEF_BUFFERS];
pgoff_t index;
ssize_t res;
size_t this_len;
.spd_release = spd_release_page,
};
+ if (splice_grow_spd(pipe, &spd))
+ return -ENOMEM;
+
+ res = -ENOMEM;
+ vec = __vec;
+ if (pipe->buffers > PIPE_DEF_BUFFERS) {
+ vec = kmalloc(pipe->buffers * sizeof(struct iovec), GFP_KERNEL);
+ if (!vec)
+ goto shrink_ret;
+ }
+
index = *ppos >> PAGE_CACHE_SHIFT;
offset = *ppos & ~PAGE_CACHE_MASK;
nr_pages = (len + offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
- for (i = 0; i < nr_pages && i < PIPE_BUFFERS && len; i++) {
+ for (i = 0; i < nr_pages && i < pipe->buffers && len; i++) {
struct page *page;
page = alloc_page(GFP_USER);
this_len = min_t(size_t, len, PAGE_CACHE_SIZE - offset);
vec[i].iov_base = (void __user *) page_address(page);
vec[i].iov_len = this_len;
- pages[i] = page;
+ spd.pages[i] = page;
spd.nr_pages++;
len -= this_len;
offset = 0;
nr_freed = 0;
for (i = 0; i < spd.nr_pages; i++) {
this_len = min_t(size_t, vec[i].iov_len, res);
- partial[i].offset = 0;
- partial[i].len = this_len;
+ spd.partial[i].offset = 0;
+ spd.partial[i].len = this_len;
if (!this_len) {
- __free_page(pages[i]);
- pages[i] = NULL;
+ __free_page(spd.pages[i]);
+ spd.pages[i] = NULL;
nr_freed++;
}
res -= this_len;
if (res > 0)
*ppos += res;
+shrink_ret:
+ if (vec != __vec)
+ kfree(vec);
+ splice_shrink_spd(pipe, &spd);
return res;
err:
for (i = 0; i < spd.nr_pages; i++)
- __free_page(pages[i]);
+ __free_page(spd.pages[i]);
- return error;
+ res = error;
+ goto shrink_ret;
}
EXPORT_SYMBOL(default_file_splice_read);
if (!buf->len) {
buf->ops = NULL;
ops->release(pipe, buf);
- pipe->curbuf = (pipe->curbuf + 1) & (PIPE_BUFFERS - 1);
+ pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1);
pipe->nrbufs--;
if (pipe->inode)
sd->need_wakeup = true;
* If we did an incomplete transfer we must release
* the pipe buffers in question:
*/
- for (i = 0; i < PIPE_BUFFERS; i++) {
+ for (i = 0; i < pipe->buffers; i++) {
struct pipe_buffer *buf = pipe->bufs + i;
if (buf->ops) {
*/
static int get_iovec_page_array(const struct iovec __user *iov,
unsigned int nr_vecs, struct page **pages,
- struct partial_page *partial, int aligned)
+ struct partial_page *partial, int aligned,
+ unsigned int pipe_buffers)
{
int buffers = 0, error = 0;
break;
npages = (off + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
- if (npages > PIPE_BUFFERS - buffers)
- npages = PIPE_BUFFERS - buffers;
+ if (npages > pipe_buffers - buffers)
+ npages = pipe_buffers - buffers;
error = get_user_pages_fast((unsigned long)base, npages,
0, &pages[buffers]);
* or if we mapped the max number of pages that we have
* room for.
*/
- if (error < npages || buffers == PIPE_BUFFERS)
+ if (error < npages || buffers == pipe_buffers)
break;
nr_vecs--;
unsigned long nr_segs, unsigned int flags)
{
struct pipe_inode_info *pipe;
- struct page *pages[PIPE_BUFFERS];
- struct partial_page partial[PIPE_BUFFERS];
+ struct page *pages[PIPE_DEF_BUFFERS];
+ struct partial_page partial[PIPE_DEF_BUFFERS];
struct splice_pipe_desc spd = {
.pages = pages,
.partial = partial,
.ops = &user_page_pipe_buf_ops,
.spd_release = spd_release_page,
};
+ long ret;
pipe = pipe_info(file->f_path.dentry->d_inode);
if (!pipe)
return -EBADF;
- spd.nr_pages = get_iovec_page_array(iov, nr_segs, pages, partial,
- flags & SPLICE_F_GIFT);
+ if (splice_grow_spd(pipe, &spd))
+ return -ENOMEM;
+
+ spd.nr_pages = get_iovec_page_array(iov, nr_segs, spd.pages,
+ spd.partial, flags & SPLICE_F_GIFT,
+ pipe->buffers);
if (spd.nr_pages <= 0)
- return spd.nr_pages;
+ ret = spd.nr_pages;
+ else
+ ret = splice_to_pipe(pipe, &spd);
- return splice_to_pipe(pipe, &spd);
+ splice_shrink_spd(pipe, &spd);
+ return ret;
}
/*
* Check ->nrbufs without the inode lock first. This function
* is speculative anyways, so missing one is ok.
*/
- if (pipe->nrbufs < PIPE_BUFFERS)
+ if (pipe->nrbufs < pipe->buffers)
return 0;
ret = 0;
pipe_lock(pipe);
- while (pipe->nrbufs >= PIPE_BUFFERS) {
+ while (pipe->nrbufs >= pipe->buffers) {
if (!pipe->readers) {
send_sig(SIGPIPE, current, 0);
ret = -EPIPE;
* Cannot make any progress, because either the input
* pipe is empty or the output pipe is full.
*/
- if (!ipipe->nrbufs || opipe->nrbufs >= PIPE_BUFFERS) {
+ if (!ipipe->nrbufs || opipe->nrbufs >= opipe->buffers) {
/* Already processed some buffers, break */
if (ret)
break;
}
ibuf = ipipe->bufs + ipipe->curbuf;
- nbuf = (opipe->curbuf + opipe->nrbufs) % PIPE_BUFFERS;
+ nbuf = (opipe->curbuf + opipe->nrbufs) & (opipe->buffers - 1);
obuf = opipe->bufs + nbuf;
if (len >= ibuf->len) {
*obuf = *ibuf;
ibuf->ops = NULL;
opipe->nrbufs++;
- ipipe->curbuf = (ipipe->curbuf + 1) % PIPE_BUFFERS;
+ ipipe->curbuf = (ipipe->curbuf + 1) & (ipipe->buffers - 1);
ipipe->nrbufs--;
input_wakeup = true;
} else {
* If we have iterated all input buffers or ran out of
* output room, break.
*/
- if (i >= ipipe->nrbufs || opipe->nrbufs >= PIPE_BUFFERS)
+ if (i >= ipipe->nrbufs || opipe->nrbufs >= opipe->buffers)
break;
- ibuf = ipipe->bufs + ((ipipe->curbuf + i) & (PIPE_BUFFERS - 1));
- nbuf = (opipe->curbuf + opipe->nrbufs) & (PIPE_BUFFERS - 1);
+ ibuf = ipipe->bufs + ((ipipe->curbuf + i) & (ipipe->buffers-1));
+ nbuf = (opipe->curbuf + opipe->nrbufs) & (opipe->buffers - 1);
/*
* Get a reference to this pipe buffer,
if (wait)
sync_inodes_sb(sb);
else
- writeback_inodes_sb(sb);
+ writeback_inodes_sb_locked(sb);
if (sb->s_op->sync_fs)
sb->s_op->sync_fs(sb, wait);
xfs_blkdev_issue_flush(
xfs_buftarg_t *buftarg)
{
- blkdev_issue_flush(buftarg->bt_bdev, NULL);
+ blkdev_issue_flush(buftarg->bt_bdev, GFP_KERNEL, NULL,
+ BLKDEV_IFL_WAIT);
}
STATIC void
#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/sched.h>
+#include <linux/timer.h>
#include <linux/writeback.h>
#include <asm/atomic.h>
struct device *dev;
+ struct timer_list laptop_mode_wb_timer;
+
#ifdef CONFIG_DEBUG_FS
struct dentry *debug_dir;
struct dentry *debug_stats;
void bdi_unregister(struct backing_dev_info *bdi);
int bdi_setup_and_register(struct backing_dev_info *, char *, unsigned int);
void bdi_start_writeback(struct backing_dev_info *bdi, struct super_block *sb,
- long nr_pages);
+ long nr_pages, int sb_locked);
int bdi_writeback_task(struct bdi_writeback *wb);
int bdi_has_dirty_io(struct backing_dev_info *bdi);
+void bdi_arm_supers_timer(void);
extern spinlock_t bdi_lock;
extern struct list_head bdi_list;
};
/*
- * two pointers are available for the IO schedulers, if they need
+ * Three pointers are available for the IO schedulers, if they need
* more they have to dynamically allocate it.
*/
void *elevator_private;
void *elevator_private2;
+ void *elevator_private3;
struct gendisk *rq_disk;
unsigned long start_time;
-
+#ifdef CONFIG_BLK_CGROUP
+ unsigned long long start_time_ns;
+ unsigned long long io_start_time_ns; /* when passed to hardware */
+#endif
/* Number of scatter-gather DMA addr+len pairs after
* physical address coalescing is performed.
*/
*/
extern struct request_queue *blk_init_queue_node(request_fn_proc *rfn,
spinlock_t *lock, int node_id);
+extern struct request_queue *blk_init_allocated_queue_node(struct request_queue *,
+ request_fn_proc *,
+ spinlock_t *, int node_id);
extern struct request_queue *blk_init_queue(request_fn_proc *, spinlock_t *);
+extern struct request_queue *blk_init_allocated_queue(struct request_queue *,
+ request_fn_proc *, spinlock_t *);
extern void blk_cleanup_queue(struct request_queue *);
extern void blk_queue_make_request(struct request_queue *, make_request_fn *);
extern void blk_queue_bounce_limit(struct request_queue *, u64);
return NULL;
return bqt->tag_index[tag];
}
-
-extern int blkdev_issue_flush(struct block_device *, sector_t *);
-#define DISCARD_FL_WAIT 0x01 /* wait for completion */
-#define DISCARD_FL_BARRIER 0x02 /* issue DISCARD_BARRIER request */
-extern int blkdev_issue_discard(struct block_device *, sector_t sector,
- sector_t nr_sects, gfp_t, int flags);
-
+enum{
+ BLKDEV_WAIT, /* wait for completion */
+ BLKDEV_BARRIER, /*issue request with barrier */
+};
+#define BLKDEV_IFL_WAIT (1 << BLKDEV_WAIT)
+#define BLKDEV_IFL_BARRIER (1 << BLKDEV_BARRIER)
+extern int blkdev_issue_flush(struct block_device *, gfp_t, sector_t *,
+ unsigned long);
+extern int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
+ sector_t nr_sects, gfp_t gfp_mask, unsigned long flags);
+extern int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
+ sector_t nr_sects, gfp_t gfp_mask, unsigned long flags);
static inline int sb_issue_discard(struct super_block *sb,
sector_t block, sector_t nr_blocks)
{
block <<= (sb->s_blocksize_bits - 9);
nr_blocks <<= (sb->s_blocksize_bits - 9);
return blkdev_issue_discard(sb->s_bdev, block, nr_blocks, GFP_KERNEL,
- DISCARD_FL_BARRIER);
+ BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
}
extern int blk_verify_command(unsigned char *cmd, fmode_t has_write_perm);
struct work_struct;
int kblockd_schedule_work(struct request_queue *q, struct work_struct *work);
+#ifdef CONFIG_BLK_CGROUP
+static inline void set_start_time_ns(struct request *req)
+{
+ req->start_time_ns = sched_clock();
+}
+
+static inline void set_io_start_time_ns(struct request *req)
+{
+ req->io_start_time_ns = sched_clock();
+}
+
+static inline uint64_t rq_start_time_ns(struct request *req)
+{
+ return req->start_time_ns;
+}
+
+static inline uint64_t rq_io_start_time_ns(struct request *req)
+{
+ return req->io_start_time_ns;
+}
+#else
+static inline void set_start_time_ns(struct request *req) {}
+static inline void set_io_start_time_ns(struct request *req) {}
+static inline uint64_t rq_start_time_ns(struct request *req)
+{
+ return 0;
+}
+static inline uint64_t rq_io_start_time_ns(struct request *req)
+{
+ return 0;
+}
+#endif
+
#define MODULE_ALIAS_BLOCKDEV(major,minor) \
MODULE_ALIAS("block-major-" __stringify(major) "-" __stringify(minor))
#define MODULE_ALIAS_BLOCKDEV_MAJOR(major) \
int (*direct_access) (struct block_device *, sector_t,
void **, unsigned long *);
int (*media_changed) (struct gendisk *);
- unsigned long long (*set_capacity) (struct gendisk *,
- unsigned long long);
+ void (*unlock_native_capacity) (struct gendisk *);
int (*revalidate_disk) (struct gendisk *);
int (*getgeo)(struct block_device *, struct hd_geometry *);
struct module *owner;
extern const char *drbd_buildtag(void);
-#define REL_VERSION "8.3.7"
+#define REL_VERSION "8.3.8rc1"
#define API_VERSION 88
#define PRO_VERSION_MIN 86
-#define PRO_VERSION_MAX 92
+#define PRO_VERSION_MAX 94
enum drbd_io_error_p {
ERR_DATA_NOT_CURRENT = 150,
ERR_CONNECTED = 151, /* DRBD 8.3 only */
ERR_PERM = 152,
+ ERR_NEED_APV_93 = 153,
/* insert new ones above this line */
AFTER_LAST_ERR_CODE
#define DRBD_MAX_BIO_BVECS_MAX 128
#define DRBD_MAX_BIO_BVECS_DEF 0
+#define DRBD_DP_VOLUME_MIN 4
+#define DRBD_DP_VOLUME_MAX 1048576
+#define DRBD_DP_VOLUME_DEF 16384
+
+#define DRBD_DP_INTERVAL_MIN 1
+#define DRBD_DP_INTERVAL_MAX 600
+#define DRBD_DP_INTERVAL_DEF 5
+
+#define DRBD_RS_THROTTLE_TH_MIN 1
+#define DRBD_RS_THROTTLE_TH_MAX 600
+#define DRBD_RS_THROTTLE_TH_DEF 20
+
+#define DRBD_RS_HOLD_OFF_TH_MIN 1
+#define DRBD_RS_HOLD_OFF_TH_MAX 6000
+#define DRBD_RS_HOLD_OFF_TH_DEF 100
+
#undef RANGE
#endif
NL_PACKET(resize, 7,
NL_INT64( 29, T_MAY_IGNORE, resize_size)
NL_BIT( 68, T_MAY_IGNORE, resize_force)
+ NL_BIT( 69, T_MANDATORY, no_resync)
)
NL_PACKET(syncer_conf, 8,
NL_INTEGER( 30, T_MAY_IGNORE, rate)
NL_INTEGER( 31, T_MAY_IGNORE, after)
NL_INTEGER( 32, T_MAY_IGNORE, al_extents)
+ NL_INTEGER( 71, T_MAY_IGNORE, dp_volume)
+ NL_INTEGER( 72, T_MAY_IGNORE, dp_interval)
+ NL_INTEGER( 73, T_MAY_IGNORE, throttle_th)
+ NL_INTEGER( 74, T_MAY_IGNORE, hold_off_th)
NL_STRING( 52, T_MAY_IGNORE, verify_alg, SHARED_SECRET_MAX)
NL_STRING( 51, T_MAY_IGNORE, cpu_mask, 32)
NL_STRING( 64, T_MAY_IGNORE, csums_alg, SHARED_SECRET_MAX)
typedef int (elevator_allow_merge_fn) (struct request_queue *, struct request *, struct bio *);
+typedef void (elevator_bio_merged_fn) (struct request_queue *,
+ struct request *, struct bio *);
+
typedef int (elevator_dispatch_fn) (struct request_queue *, int);
typedef void (elevator_add_req_fn) (struct request_queue *, struct request *);
elevator_merged_fn *elevator_merged_fn;
elevator_merge_req_fn *elevator_merge_req_fn;
elevator_allow_merge_fn *elevator_allow_merge_fn;
+ elevator_bio_merged_fn *elevator_bio_merged_fn;
elevator_dispatch_fn *elevator_dispatch_fn;
elevator_add_req_fn *elevator_add_req_fn;
extern void elv_merge_requests(struct request_queue *, struct request *,
struct request *);
extern void elv_merged_request(struct request_queue *, struct request *, int);
+extern void elv_bio_merged(struct request_queue *q, struct request *,
+ struct bio *);
extern void elv_requeue_request(struct request_queue *, struct request *);
extern int elv_queue_empty(struct request_queue *);
extern struct request *elv_former_request(struct request_queue *, struct request *);
*/
#define F_NOTIFY (F_LINUX_SPECIFIC_BASE+2)
+/*
+ * Set and get of pipe page size array
+ */
+#define F_SETPIPE_SZ (F_LINUX_SPECIFIC_BASE + 7)
+#define F_GETPIPE_SZ (F_LINUX_SPECIFIC_BASE + 8)
+
/*
* Types of directory notifications that may be requested.
*/
int bd_openers;
struct mutex bd_mutex; /* open/close mutex */
struct list_head bd_inodes;
+ void * bd_claiming;
void * bd_holder;
int bd_holders;
#ifdef CONFIG_SYSFS
struct ide_disk_ops {
int (*check)(struct ide_drive_s *, const char *);
int (*get_capacity)(struct ide_drive_s *);
- u64 (*set_capacity)(struct ide_drive_s *, u64);
+ void (*unlock_native_capacity)(struct ide_drive_s *);
void (*setup)(struct ide_drive_s *);
void (*flush)(struct ide_drive_s *);
int (*init_media)(struct ide_drive_s *, struct gendisk *);
#define PIPEFS_MAGIC 0x50495045
-#define PIPE_BUFFERS (16)
+#define PIPE_DEF_BUFFERS 16
#define PIPE_BUF_FLAG_LRU 0x01 /* page is on the LRU */
#define PIPE_BUF_FLAG_ATOMIC 0x02 /* was atomically mapped */
**/
struct pipe_inode_info {
wait_queue_head_t wait;
- unsigned int nrbufs, curbuf;
- struct page *tmp_page;
+ unsigned int nrbufs, curbuf, buffers;
unsigned int readers;
unsigned int writers;
unsigned int waiting_writers;
unsigned int r_counter;
unsigned int w_counter;
+ struct page *tmp_page;
struct fasync_struct *fasync_readers;
struct fasync_struct *fasync_writers;
struct inode *inode;
- struct pipe_buffer bufs[PIPE_BUFFERS];
+ struct pipe_buffer *bufs;
};
/*
void pipe_unlock(struct pipe_inode_info *);
void pipe_double_lock(struct pipe_inode_info *, struct pipe_inode_info *);
+extern unsigned int pipe_max_pages;
+
/* Drop the inode semaphore and wait for a pipe event, atomically */
void pipe_wait(struct pipe_inode_info *pipe);
int generic_pipe_buf_steal(struct pipe_inode_info *, struct pipe_buffer *);
void generic_pipe_buf_release(struct pipe_inode_info *, struct pipe_buffer *);
+/* for F_SETPIPE_SZ and F_GETPIPE_SZ */
+long pipe_fcntl(struct file *, unsigned int, unsigned long arg);
+
#endif
extern ssize_t splice_direct_to_actor(struct file *, struct splice_desc *,
splice_direct_actor *);
+/*
+ * for dynamic pipe sizing
+ */
+extern int splice_grow_spd(struct pipe_inode_info *, struct splice_pipe_desc *);
+extern void splice_shrink_spd(struct pipe_inode_info *,
+ struct splice_pipe_desc *);
+
#endif
* so we use a single control to update them
*/
unsigned no_nrwrite_index_update:1;
+
+ /*
+ * For WB_SYNC_ALL, the sb must always be pinned. For WB_SYNC_NONE,
+ * the writeback code will pin the sb for the caller. However,
+ * for eg umount, the caller does WB_SYNC_NONE but already has
+ * the sb pinned. If the below is set, caller already has the
+ * sb pinned.
+ */
+ unsigned sb_pinned:1;
};
/*
struct bdi_writeback;
int inode_wait(void *);
void writeback_inodes_sb(struct super_block *);
+void writeback_inodes_sb_locked(struct super_block *);
int writeback_inodes_sb_if_idle(struct super_block *);
void sync_inodes_sb(struct super_block *);
void writeback_inodes_wbc(struct writeback_control *wbc);
/*
* mm/page-writeback.c
*/
-void laptop_io_completion(void);
+#ifdef CONFIG_BLOCK
+void laptop_io_completion(struct backing_dev_info *info);
void laptop_sync_completion(void);
+void laptop_mode_sync(struct work_struct *work);
+void laptop_mode_timer_fn(unsigned long data);
+#else
+static inline void laptop_sync_completion(void) { }
+#endif
void throttle_vm_writeout(gfp_t gfp_mask);
/* These are exported to sysctl. */
endif #CGROUP_SCHED
+config BLK_CGROUP
+ tristate "Block IO controller"
+ depends on CGROUPS && BLOCK
+ default n
+ ---help---
+ Generic block IO controller cgroup interface. This is the common
+ cgroup interface which should be used by various IO controlling
+ policies.
+
+ Currently, CFQ IO scheduler uses it to recognize task groups and
+ control disk bandwidth allocation (proportional time slice allocation)
+ to such task groups.
+
+ This option only enables generic Block IO controller infrastructure.
+ One needs to also enable actual IO controlling logic in CFQ for it
+ to take effect. (CONFIG_CFQ_GROUP_IOSCHED=y).
+
+ See Documentation/cgroups/blkio-controller.txt for more information.
+
+config DEBUG_BLK_CGROUP
+ bool "Enable Block IO controller debugging"
+ depends on BLK_CGROUP
+ default n
+ ---help---
+ Enable some debugging help. Currently it exports additional stat
+ files in a cgroup which can be useful for debugging.
+
endif # CGROUPS
config MM_OWNER
size_t read_subbuf = read_start / subbuf_size;
size_t padding = rbuf->padding[read_subbuf];
size_t nonpad_end = read_subbuf * subbuf_size + subbuf_size - padding;
- struct page *pages[PIPE_BUFFERS];
- struct partial_page partial[PIPE_BUFFERS];
+ struct page *pages[PIPE_DEF_BUFFERS];
+ struct partial_page partial[PIPE_DEF_BUFFERS];
struct splice_pipe_desc spd = {
.pages = pages,
.nr_pages = 0,
if (rbuf->subbufs_produced == rbuf->subbufs_consumed)
return 0;
+ if (splice_grow_spd(pipe, &spd))
+ return -ENOMEM;
/*
* Adjust read len, if longer than what is available
subbuf_pages = rbuf->chan->alloc_size >> PAGE_SHIFT;
pidx = (read_start / PAGE_SIZE) % subbuf_pages;
poff = read_start & ~PAGE_MASK;
- nr_pages = min_t(unsigned int, subbuf_pages, PIPE_BUFFERS);
+ nr_pages = min_t(unsigned int, subbuf_pages, pipe->buffers);
for (total_len = 0; spd.nr_pages < nr_pages; spd.nr_pages++) {
unsigned int this_len, this_end, private;
}
}
+ ret = 0;
if (!spd.nr_pages)
- return 0;
+ goto out;
ret = *nonpad_ret = splice_to_pipe(pipe, &spd);
if (ret < 0 || ret < total_len)
- return ret;
+ goto out;
if (read_start + ret == nonpad_end)
ret += padding;
+out:
+ splice_shrink_spd(pipe, &spd);
return ret;
}
return (unsigned long long)(jiffies - INITIAL_JIFFIES)
* (NSEC_PER_SEC / HZ);
}
+EXPORT_SYMBOL_GPL(sched_clock);
static __read_mostly int sched_clock_running;
#include <linux/slow-work.h>
#include <linux/perf_event.h>
#include <linux/kprobes.h>
+#include <linux/pipe_fs_i.h>
#include <asm/uaccess.h>
#include <asm/processor.h>
.child = binfmt_misc_table,
},
#endif
+ {
+ .procname = "pipe-max-pages",
+ .data = &pipe_max_pages,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_minmax,
+ .extra1 = &two,
+ },
/*
* NOTE: do not add new entries to this table unless you have read
* Documentation/sysctl/ctl_unnumbered.txt
size_t len,
unsigned int flags)
{
- struct page *pages[PIPE_BUFFERS];
- struct partial_page partial[PIPE_BUFFERS];
+ struct page *pages_def[PIPE_DEF_BUFFERS];
+ struct partial_page partial_def[PIPE_DEF_BUFFERS];
struct trace_iterator *iter = filp->private_data;
struct splice_pipe_desc spd = {
- .pages = pages,
- .partial = partial,
+ .pages = pages_def,
+ .partial = partial_def,
.nr_pages = 0, /* This gets updated below. */
.flags = flags,
.ops = &tracing_pipe_buf_ops,
size_t rem;
unsigned int i;
+ if (splice_grow_spd(pipe, &spd))
+ return -ENOMEM;
+
/* copy the tracer to avoid using a global lock all around */
mutex_lock(&trace_types_lock);
if (unlikely(old_tracer != current_trace && current_trace)) {
trace_access_lock(iter->cpu_file);
/* Fill as many pages as possible. */
- for (i = 0, rem = len; i < PIPE_BUFFERS && rem; i++) {
- pages[i] = alloc_page(GFP_KERNEL);
- if (!pages[i])
+ for (i = 0, rem = len; i < pipe->buffers && rem; i++) {
+ spd.pages[i] = alloc_page(GFP_KERNEL);
+ if (!spd.pages[i])
break;
rem = tracing_fill_pipe_page(rem, iter);
/* Copy the data into the page, so we can start over. */
ret = trace_seq_to_buffer(&iter->seq,
- page_address(pages[i]),
+ page_address(spd.pages[i]),
iter->seq.len);
if (ret < 0) {
- __free_page(pages[i]);
+ __free_page(spd.pages[i]);
break;
}
- partial[i].offset = 0;
- partial[i].len = iter->seq.len;
+ spd.partial[i].offset = 0;
+ spd.partial[i].len = iter->seq.len;
trace_seq_init(&iter->seq);
}
spd.nr_pages = i;
- return splice_to_pipe(pipe, &spd);
+ ret = splice_to_pipe(pipe, &spd);
+out:
+ splice_shrink_spd(pipe, &spd);
+ return ret;
out_err:
mutex_unlock(&iter->mutex);
-
- return ret;
+ goto out;
}
static ssize_t
unsigned int flags)
{
struct ftrace_buffer_info *info = file->private_data;
- struct partial_page partial[PIPE_BUFFERS];
- struct page *pages[PIPE_BUFFERS];
+ struct partial_page partial_def[PIPE_DEF_BUFFERS];
+ struct page *pages_def[PIPE_DEF_BUFFERS];
struct splice_pipe_desc spd = {
- .pages = pages,
- .partial = partial,
+ .pages = pages_def,
+ .partial = partial_def,
.flags = flags,
.ops = &buffer_pipe_buf_ops,
.spd_release = buffer_spd_release,
int entries, size, i;
size_t ret;
+ if (splice_grow_spd(pipe, &spd))
+ return -ENOMEM;
+
if (*ppos & (PAGE_SIZE - 1)) {
WARN_ONCE(1, "Ftrace: previous read must page-align\n");
- return -EINVAL;
+ ret = -EINVAL;
+ goto out;
}
if (len & (PAGE_SIZE - 1)) {
WARN_ONCE(1, "Ftrace: splice_read should page-align\n");
- if (len < PAGE_SIZE)
- return -EINVAL;
+ if (len < PAGE_SIZE) {
+ ret = -EINVAL;
+ goto out;
+ }
len &= PAGE_MASK;
}
trace_access_lock(info->cpu);
entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu);
- for (i = 0; i < PIPE_BUFFERS && len && entries; i++, len -= PAGE_SIZE) {
+ for (i = 0; i < pipe->buffers && len && entries; i++, len -= PAGE_SIZE) {
struct page *page;
int r;
else
ret = 0;
/* TODO: block */
- return ret;
+ goto out;
}
ret = splice_to_pipe(pipe, &spd);
-
+ splice_shrink_spd(pipe, &spd);
+out:
return ret;
}
static int bdi_sync_supers(void *);
static void sync_supers_timer_fn(unsigned long);
-static void arm_supers_timer(void);
static void bdi_add_default_flusher_task(struct backing_dev_info *bdi);
init_timer(&sync_supers_timer);
setup_timer(&sync_supers_timer, sync_supers_timer_fn, 0);
- arm_supers_timer();
+ bdi_arm_supers_timer();
err = bdi_init(&default_backing_dev_info);
if (!err)
return 0;
}
-static void arm_supers_timer(void)
+void bdi_arm_supers_timer(void)
{
unsigned long next;
+ if (!dirty_writeback_interval)
+ return;
+
next = msecs_to_jiffies(dirty_writeback_interval * 10) + jiffies;
mod_timer(&sync_supers_timer, round_jiffies_up(next));
}
static void sync_supers_timer_fn(unsigned long unused)
{
wake_up_process(sync_supers_tsk);
- arm_supers_timer();
+ bdi_arm_supers_timer();
}
static int bdi_forker_task(void *ptr)
spin_unlock_bh(&bdi_lock);
wait = msecs_to_jiffies(dirty_writeback_interval * 10);
- schedule_timeout(wait);
+ if (wait)
+ schedule_timeout(wait);
+ else
+ schedule();
try_to_freeze();
continue;
}
(!laptop_mode && ((global_page_state(NR_FILE_DIRTY)
+ global_page_state(NR_UNSTABLE_NFS))
> background_thresh)))
- bdi_start_writeback(bdi, NULL, 0);
+ bdi_start_writeback(bdi, NULL, 0, 0);
}
void set_page_dirty_balance(struct page *page, int page_mkwrite)
}
}
-static void laptop_timer_fn(unsigned long unused);
-
-static DEFINE_TIMER(laptop_mode_wb_timer, laptop_timer_fn, 0, 0);
-
/*
* sysctl handler for /proc/sys/vm/dirty_writeback_centisecs
*/
void __user *buffer, size_t *length, loff_t *ppos)
{
proc_dointvec(table, write, buffer, length, ppos);
+ bdi_arm_supers_timer();
return 0;
}
-static void do_laptop_sync(struct work_struct *work)
+#ifdef CONFIG_BLOCK
+void laptop_mode_timer_fn(unsigned long data)
{
- wakeup_flusher_threads(0);
- kfree(work);
-}
+ struct request_queue *q = (struct request_queue *)data;
+ int nr_pages = global_page_state(NR_FILE_DIRTY) +
+ global_page_state(NR_UNSTABLE_NFS);
-static void laptop_timer_fn(unsigned long unused)
-{
- struct work_struct *work;
+ /*
+ * We want to write everything out, not just down to the dirty
+ * threshold
+ */
- work = kmalloc(sizeof(*work), GFP_ATOMIC);
- if (work) {
- INIT_WORK(work, do_laptop_sync);
- schedule_work(work);
- }
+ if (bdi_has_dirty_io(&q->backing_dev_info))
+ bdi_start_writeback(&q->backing_dev_info, NULL, nr_pages, 0);
}
/*
* of all dirty data a few seconds from now. If the flush is already scheduled
* then push it back - the user is still using the disk.
*/
-void laptop_io_completion(void)
+void laptop_io_completion(struct backing_dev_info *info)
{
- mod_timer(&laptop_mode_wb_timer, jiffies + laptop_mode);
+ mod_timer(&info->laptop_mode_wb_timer, jiffies + laptop_mode);
}
/*
*/
void laptop_sync_completion(void)
{
- del_timer(&laptop_mode_wb_timer);
+ struct backing_dev_info *bdi;
+
+ rcu_read_lock();
+
+ list_for_each_entry_rcu(bdi, &bdi_list, bdi_list)
+ del_timer(&bdi->laptop_mode_wb_timer);
+
+ rcu_read_unlock();
}
+#endif
/*
* If ratelimit_pages is too high then we can get into dirty-data overload
nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9);
if (nr_blocks) {
err = blkdev_issue_discard(si->bdev, start_block,
- nr_blocks, GFP_KERNEL, DISCARD_FL_BARRIER);
+ nr_blocks, GFP_KERNEL,
+ BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
if (err)
return err;
cond_resched();
nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9);
err = blkdev_issue_discard(si->bdev, start_block,
- nr_blocks, GFP_KERNEL, DISCARD_FL_BARRIER);
+ nr_blocks, GFP_KERNEL,
+ BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
if (err)
break;
start_block <<= PAGE_SHIFT - 9;
nr_blocks <<= PAGE_SHIFT - 9;
if (blkdev_issue_discard(si->bdev, start_block,
- nr_blocks, GFP_NOIO, DISCARD_FL_BARRIER))
+ nr_blocks, GFP_NOIO, BLKDEV_IFL_WAIT |
+ BLKDEV_IFL_BARRIER))
break;
}
/*
* Fill page/offset/length into spd, if it can hold more pages.
*/
-static inline int spd_fill_page(struct splice_pipe_desc *spd, struct page *page,
+static inline int spd_fill_page(struct splice_pipe_desc *spd,
+ struct pipe_inode_info *pipe, struct page *page,
unsigned int *len, unsigned int offset,
struct sk_buff *skb, int linear,
struct sock *sk)
{
- if (unlikely(spd->nr_pages == PIPE_BUFFERS))
+ if (unlikely(spd->nr_pages == pipe->buffers))
return 1;
if (linear) {
unsigned int plen, unsigned int *off,
unsigned int *len, struct sk_buff *skb,
struct splice_pipe_desc *spd, int linear,
- struct sock *sk)
+ struct sock *sk,
+ struct pipe_inode_info *pipe)
{
if (!*len)
return 1;
/* the linear region may spread across several pages */
flen = min_t(unsigned int, flen, PAGE_SIZE - poff);
- if (spd_fill_page(spd, page, &flen, poff, skb, linear, sk))
+ if (spd_fill_page(spd, pipe, page, &flen, poff, skb, linear, sk))
return 1;
__segment_seek(&page, &poff, &plen, flen);
* Map linear and fragment data from the skb to spd. It reports failure if the
* pipe is full or if we already spliced the requested length.
*/
-static int __skb_splice_bits(struct sk_buff *skb, unsigned int *offset,
- unsigned int *len, struct splice_pipe_desc *spd,
- struct sock *sk)
+static int __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe,
+ unsigned int *offset, unsigned int *len,
+ struct splice_pipe_desc *spd, struct sock *sk)
{
int seg;
if (__splice_segment(virt_to_page(skb->data),
(unsigned long) skb->data & (PAGE_SIZE - 1),
skb_headlen(skb),
- offset, len, skb, spd, 1, sk))
+ offset, len, skb, spd, 1, sk, pipe))
return 1;
/*
const skb_frag_t *f = &skb_shinfo(skb)->frags[seg];
if (__splice_segment(f->page, f->page_offset, f->size,
- offset, len, skb, spd, 0, sk))
+ offset, len, skb, spd, 0, sk, pipe))
return 1;
}
struct pipe_inode_info *pipe, unsigned int tlen,
unsigned int flags)
{
- struct partial_page partial[PIPE_BUFFERS];
- struct page *pages[PIPE_BUFFERS];
+ struct partial_page partial[PIPE_DEF_BUFFERS];
+ struct page *pages[PIPE_DEF_BUFFERS];
struct splice_pipe_desc spd = {
.pages = pages,
.partial = partial,
};
struct sk_buff *frag_iter;
struct sock *sk = skb->sk;
+ int ret = 0;
+
+ if (splice_grow_spd(pipe, &spd))
+ return -ENOMEM;
/*
* __skb_splice_bits() only fails if the output has no room left,
* so no point in going over the frag_list for the error case.
*/
- if (__skb_splice_bits(skb, &offset, &tlen, &spd, sk))
+ if (__skb_splice_bits(skb, pipe, &offset, &tlen, &spd, sk))
goto done;
else if (!tlen)
goto done;
skb_walk_frags(skb, frag_iter) {
if (!tlen)
break;
- if (__skb_splice_bits(frag_iter, &offset, &tlen, &spd, sk))
+ if (__skb_splice_bits(frag_iter, pipe, &offset, &tlen, &spd, sk))
break;
}
done:
if (spd.nr_pages) {
- int ret;
-
/*
* Drop the socket lock, otherwise we have reverse
* locking dependencies between sk_lock and i_mutex
release_sock(sk);
ret = splice_to_pipe(pipe, &spd);
lock_sock(sk);
- return ret;
}
- return 0;
+ splice_shrink_spd(pipe, &spd);
+ return ret;
}
/**