for-6.9/block-20240310

-----BEGIN PGP SIGNATURE-----
 
 iQJEBAABCAAuFiEEwPw5LcreJtl1+l5K99NY+ylx4KYFAmXuFO4QHGF4Ym9lQGtl
 cm5lbC5kawAKCRD301j7KXHgpq33D/9hyNyBce2A9iyo026eK8EqLDoed6BPzuvB
 kLKj5tsGvX4YlfuswvP86M5dgibTASXclnfUK394TijW/JPOfJ3mNhi9gMnHzRoK
 ZaR1di0Lum56dY1FkpMmWiGmE4fB79PAtXYKtajOkuoIcNzylncEAAACUY4/Ouhg
 Cm+LMg2prcc+m9g8rKDNQ51pUFg4U21KAUTl35XLMUAaQk1ahW3EDEVYhweC/zwE
 V/5hJsv8UY72+oQGY2Dc/YgQk/Zj4ZDh7C+oHR9XeB/ro99kr3/Vopagu0gBMLZi
 Rq6qqz6PVMhVcuz8uN2rsTQKXmXhsBn9/adsl4AKtdxcW5D5moWb5BLq1P0WQylc
 nzMxa1d6cVcTKZpaUQQv3Rj6ZMrLuDwP277UYHfn5x1oPWYRZCG7FtHuOo1gNcpG
 DrSNwVG6BSDcbABqI+MIS2oD1JoUMyevjwT7e2hOXukZhc6GLO5F3ODWE5j3KnCR
 S/aGSAmcdR4fTcgavULqWdQVt7SYl4f1IxT8KrUirJGVhc2LgahaWj69ooklVHoU
 fPDFRiruwJ5YkH4RWCSDm9mi4kAz6eUf+f4yE06wZOFOb2fT8/1ZK2Snpz2KeXuZ
 INO0RejtFzT8L0OUlu7dBmF20y6rgAYt87lR8mIt71yuuATIrVhzlX1VdsvhdrAo
 VLHGV1Ncgw==
 =WlVL
 -----END PGP SIGNATURE-----

Merge tag 'for-6.9/block-20240310' of git://git.kernel.dk/linux

Pull block updates from Jens Axboe:

 - MD pull requests via Song:
      - Cleanup redundant checks (Yu Kuai)
      - Remove deprecated headers (Marc Zyngier, Song Liu)
      - Concurrency fixes (Li Lingfeng)
      - Memory leak fix (Li Nan)
      - Refactor raid1 read_balance (Yu Kuai, Paul Luse)
      - Clean up and fix for md_ioctl (Li Nan)
      - Other small fixes (Gui-Dong Han, Heming Zhao)
      - MD atomic limits (Christoph)

 - NVMe pull request via Keith:
      - RDMA target enhancements (Max)
      - Fabrics fixes (Max, Guixin, Hannes)
      - Atomic queue_limits usage (Christoph)
      - Const use for class_register (Ricardo)
      - Identification error handling fixes (Shin'ichiro, Keith)

 - Improvement and cleanup for cached request handling (Christoph)

 - Moving towards atomic queue limits. Core changes and driver bits so
   far (Christoph)

 - Fix UAF issues in aoeblk (Chun-Yi)

 - Zoned fix and cleanups (Damien)

 - s390 dasd cleanups and fixes (Jan, Miroslav)

 - Block issue timestamp caching (me)

 - noio scope guarding for zoned IO (Johannes)

 - block/nvme PI improvements (Kanchan)

 - Ability to terminate long running discard loop (Keith)

 - bdev revalidation fix (Li)

 - Get rid of old nr_queues hack for kdump kernels (Ming)

 - Support for async deletion of ublk (Ming)

 - Improve IRQ bio recycling (Pavel)

 - Factor in CPU capacity for remote vs local completion (Qais)

 - Add shared_tags configfs entry for null_blk (Shin'ichiro)

 - Fix for a regression in page refcounts introduced by the folio
   unification (Tony)

 - Misc fixes and cleanups (Arnd, Colin, John, Kunwu, Li, Navid,
   Ricardo, Roman, Tang, Uwe)

* tag 'for-6.9/block-20240310' of git://git.kernel.dk/linux: (221 commits)
  block: partitions: only define function mac_fix_string for CONFIG_PPC_PMAC
  block/swim: Convert to platform remove callback returning void
  cdrom: gdrom: Convert to platform remove callback returning void
  block: remove disk_stack_limits
  md: remove mddev->queue
  md: don't initialize queue limits
  md/raid10: use the atomic queue limit update APIs
  md/raid5: use the atomic queue limit update APIs
  md/raid1: use the atomic queue limit update APIs
  md/raid0: use the atomic queue limit update APIs
  md: add queue limit helpers
  md: add a mddev_is_dm helper
  md: add a mddev_add_trace_msg helper
  md: add a mddev_trace_remap helper
  bcache: move calculation of stripe_size and io_opt into bcache_device_init
  virtio_blk: Do not use disk_set_max_open/active_zones()
  aoe: fix the potential use-after-free problem in aoecmd_cfg_pkts
  block: move capacity validation to blkpg_do_ioctl()
  block: prevent division by zero in blk_rq_stat_sum()
  drbd: atomically update queue limits in drbd_reconsider_queue_parameters
  ...
This commit is contained in:
Linus Torvalds 2024-03-11 11:43:44 -07:00
commit 1ddeeb2a05
138 changed files with 3571 additions and 3298 deletions

View File

@ -96,6 +96,9 @@ static const struct block_device_operations nfhd_ops = {
static int __init nfhd_init_one(int id, u32 blocks, u32 bsize)
{
struct queue_limits lim = {
.logical_block_size = bsize,
};
struct nfhd_device *dev;
int dev_id = id - NFHD_DEV_OFFSET;
int err = -ENOMEM;
@ -117,9 +120,11 @@ static int __init nfhd_init_one(int id, u32 blocks, u32 bsize)
dev->bsize = bsize;
dev->bshift = ffs(bsize) - 10;
dev->disk = blk_alloc_disk(NUMA_NO_NODE);
if (!dev->disk)
dev->disk = blk_alloc_disk(&lim, NUMA_NO_NODE);
if (IS_ERR(dev->disk)) {
err = PTR_ERR(dev->disk);
goto free_dev;
}
dev->disk->major = major_num;
dev->disk->first_minor = dev_id * 16;
@ -128,7 +133,6 @@ static int __init nfhd_init_one(int id, u32 blocks, u32 bsize)
dev->disk->private_data = dev;
sprintf(dev->disk->disk_name, "nfhd%u", dev_id);
set_capacity(dev->disk, (sector_t)blocks * (bsize / 512));
blk_queue_logical_block_size(dev->disk->queue, bsize);
err = add_disk(dev->disk);
if (err)
goto out_cleanup_disk;

View File

@ -108,8 +108,6 @@ static inline void ubd_set_bit(__u64 bit, unsigned char *data)
static DEFINE_MUTEX(ubd_lock);
static DEFINE_MUTEX(ubd_mutex); /* replaces BKL, might not be needed */
static int ubd_open(struct gendisk *disk, blk_mode_t mode);
static void ubd_release(struct gendisk *disk);
static int ubd_ioctl(struct block_device *bdev, blk_mode_t mode,
unsigned int cmd, unsigned long arg);
static int ubd_getgeo(struct block_device *bdev, struct hd_geometry *geo);
@ -118,16 +116,11 @@ static int ubd_getgeo(struct block_device *bdev, struct hd_geometry *geo);
static const struct block_device_operations ubd_blops = {
.owner = THIS_MODULE,
.open = ubd_open,
.release = ubd_release,
.ioctl = ubd_ioctl,
.compat_ioctl = blkdev_compat_ptr_ioctl,
.getgeo = ubd_getgeo,
};
/* Protected by ubd_lock */
static struct gendisk *ubd_gendisk[MAX_DEV];
#ifdef CONFIG_BLK_DEV_UBD_SYNC
#define OPEN_FLAGS ((struct openflags) { .r = 1, .w = 1, .s = 1, .c = 0, \
.cl = 1 })
@ -155,7 +148,6 @@ struct ubd {
* backing or the cow file. */
char *file;
char *serial;
int count;
int fd;
__u64 size;
struct openflags boot_openflags;
@ -165,7 +157,7 @@ struct ubd {
unsigned no_trim:1;
struct cow cow;
struct platform_device pdev;
struct request_queue *queue;
struct gendisk *disk;
struct blk_mq_tag_set tag_set;
spinlock_t lock;
};
@ -181,7 +173,6 @@ struct ubd {
#define DEFAULT_UBD { \
.file = NULL, \
.serial = NULL, \
.count = 0, \
.fd = -1, \
.size = -1, \
.boot_openflags = OPEN_FLAGS, \
@ -774,8 +765,6 @@ static int ubd_open_dev(struct ubd *ubd_dev)
ubd_dev->fd = fd;
if(ubd_dev->cow.file != NULL){
blk_queue_max_hw_sectors(ubd_dev->queue, 8 * sizeof(long));
err = -ENOMEM;
ubd_dev->cow.bitmap = vmalloc(ubd_dev->cow.bitmap_len);
if(ubd_dev->cow.bitmap == NULL){
@ -797,11 +786,6 @@ static int ubd_open_dev(struct ubd *ubd_dev)
if(err < 0) goto error;
ubd_dev->cow.fd = err;
}
if (ubd_dev->no_trim == 0) {
blk_queue_max_discard_sectors(ubd_dev->queue, UBD_MAX_REQUEST);
blk_queue_max_write_zeroes_sectors(ubd_dev->queue, UBD_MAX_REQUEST);
}
blk_queue_flag_set(QUEUE_FLAG_NONROT, ubd_dev->queue);
return 0;
error:
os_close_file(ubd_dev->fd);
@ -851,27 +835,6 @@ static const struct attribute_group *ubd_attr_groups[] = {
NULL,
};
static int ubd_disk_register(int major, u64 size, int unit,
struct gendisk *disk)
{
disk->major = major;
disk->first_minor = unit << UBD_SHIFT;
disk->minors = 1 << UBD_SHIFT;
disk->fops = &ubd_blops;
set_capacity(disk, size / 512);
sprintf(disk->disk_name, "ubd%c", 'a' + unit);
ubd_devs[unit].pdev.id = unit;
ubd_devs[unit].pdev.name = DRIVER_NAME;
ubd_devs[unit].pdev.dev.release = ubd_device_release;
dev_set_drvdata(&ubd_devs[unit].pdev.dev, &ubd_devs[unit]);
platform_device_register(&ubd_devs[unit].pdev);
disk->private_data = &ubd_devs[unit];
disk->queue = ubd_devs[unit].queue;
return device_add_disk(&ubd_devs[unit].pdev.dev, disk, ubd_attr_groups);
}
#define ROUND_BLOCK(n) ((n + (SECTOR_SIZE - 1)) & (-SECTOR_SIZE))
static const struct blk_mq_ops ubd_mq_ops = {
@ -881,18 +844,36 @@ static const struct blk_mq_ops ubd_mq_ops = {
static int ubd_add(int n, char **error_out)
{
struct ubd *ubd_dev = &ubd_devs[n];
struct queue_limits lim = {
.max_segments = MAX_SG,
.seg_boundary_mask = PAGE_SIZE - 1,
};
struct gendisk *disk;
int err = 0;
if(ubd_dev->file == NULL)
goto out;
if (ubd_dev->cow.file)
lim.max_hw_sectors = 8 * sizeof(long);
if (!ubd_dev->no_trim) {
lim.max_hw_discard_sectors = UBD_MAX_REQUEST;
lim.max_write_zeroes_sectors = UBD_MAX_REQUEST;
}
err = ubd_file_size(ubd_dev, &ubd_dev->size);
if(err < 0){
*error_out = "Couldn't determine size of device's file";
goto out;
}
err = ubd_open_dev(ubd_dev);
if (err) {
pr_err("ubd%c: Can't open \"%s\": errno = %d\n",
'a' + n, ubd_dev->file, -err);
goto out;
}
ubd_dev->size = ROUND_BLOCK(ubd_dev->size);
ubd_dev->tag_set.ops = &ubd_mq_ops;
@ -904,29 +885,43 @@ static int ubd_add(int n, char **error_out)
err = blk_mq_alloc_tag_set(&ubd_dev->tag_set);
if (err)
goto out;
goto out_close;
disk = blk_mq_alloc_disk(&ubd_dev->tag_set, ubd_dev);
disk = blk_mq_alloc_disk(&ubd_dev->tag_set, &lim, ubd_dev);
if (IS_ERR(disk)) {
err = PTR_ERR(disk);
goto out_cleanup_tags;
}
ubd_dev->queue = disk->queue;
blk_queue_write_cache(ubd_dev->queue, true, false);
blk_queue_max_segments(ubd_dev->queue, MAX_SG);
blk_queue_segment_boundary(ubd_dev->queue, PAGE_SIZE - 1);
err = ubd_disk_register(UBD_MAJOR, ubd_dev->size, n, disk);
blk_queue_flag_set(QUEUE_FLAG_NONROT, disk->queue);
blk_queue_write_cache(disk->queue, true, false);
disk->major = UBD_MAJOR;
disk->first_minor = n << UBD_SHIFT;
disk->minors = 1 << UBD_SHIFT;
disk->fops = &ubd_blops;
set_capacity(disk, ubd_dev->size / 512);
sprintf(disk->disk_name, "ubd%c", 'a' + n);
disk->private_data = ubd_dev;
set_disk_ro(disk, !ubd_dev->openflags.w);
ubd_dev->pdev.id = n;
ubd_dev->pdev.name = DRIVER_NAME;
ubd_dev->pdev.dev.release = ubd_device_release;
dev_set_drvdata(&ubd_dev->pdev.dev, ubd_dev);
platform_device_register(&ubd_dev->pdev);
err = device_add_disk(&ubd_dev->pdev.dev, disk, ubd_attr_groups);
if (err)
goto out_cleanup_disk;
ubd_gendisk[n] = disk;
return 0;
out_cleanup_disk:
put_disk(disk);
out_cleanup_tags:
blk_mq_free_tag_set(&ubd_dev->tag_set);
out_close:
ubd_close_dev(ubd_dev);
out:
return err;
}
@ -1012,7 +1007,6 @@ static int ubd_id(char **str, int *start_out, int *end_out)
static int ubd_remove(int n, char **error_out)
{
struct gendisk *disk = ubd_gendisk[n];
struct ubd *ubd_dev;
int err = -ENODEV;
@ -1023,15 +1017,15 @@ static int ubd_remove(int n, char **error_out)
if(ubd_dev->file == NULL)
goto out;
/* you cannot remove a open disk */
err = -EBUSY;
if(ubd_dev->count > 0)
goto out;
if (ubd_dev->disk) {
/* you cannot remove a open disk */
err = -EBUSY;
if (disk_openers(ubd_dev->disk))
goto out;
ubd_gendisk[n] = NULL;
if(disk != NULL){
del_gendisk(disk);
put_disk(disk);
del_gendisk(ubd_dev->disk);
ubd_close_dev(ubd_dev);
put_disk(ubd_dev->disk);
}
err = 0;
@ -1153,37 +1147,6 @@ static int __init ubd_driver_init(void){
device_initcall(ubd_driver_init);
static int ubd_open(struct gendisk *disk, blk_mode_t mode)
{
struct ubd *ubd_dev = disk->private_data;
int err = 0;
mutex_lock(&ubd_mutex);
if(ubd_dev->count == 0){
err = ubd_open_dev(ubd_dev);
if(err){
printk(KERN_ERR "%s: Can't open \"%s\": errno = %d\n",
disk->disk_name, ubd_dev->file, -err);
goto out;
}
}
ubd_dev->count++;
set_disk_ro(disk, !ubd_dev->openflags.w);
out:
mutex_unlock(&ubd_mutex);
return err;
}
static void ubd_release(struct gendisk *disk)
{
struct ubd *ubd_dev = disk->private_data;
mutex_lock(&ubd_mutex);
if(--ubd_dev->count == 0)
ubd_close_dev(ubd_dev);
mutex_unlock(&ubd_mutex);
}
static void cowify_bitmap(__u64 io_offset, int length, unsigned long *cow_mask,
__u64 *cow_offset, unsigned long *bitmap,
__u64 bitmap_offset, unsigned long *bitmap_words,

View File

@ -264,16 +264,18 @@ static int __init simdisk_setup(struct simdisk *dev, int which,
struct proc_dir_entry *procdir)
{
char tmp[2] = { '0' + which, 0 };
int err = -ENOMEM;
int err;
dev->fd = -1;
dev->filename = NULL;
spin_lock_init(&dev->lock);
dev->users = 0;
dev->gd = blk_alloc_disk(NUMA_NO_NODE);
if (!dev->gd)
dev->gd = blk_alloc_disk(NULL, NUMA_NO_NODE);
if (IS_ERR(dev->gd)) {
err = PTR_ERR(dev->gd);
goto out;
}
dev->gd->major = simdisk_major;
dev->gd->first_minor = which;
dev->gd->minors = SIMDISK_MINORS;

View File

@ -383,7 +383,7 @@ void __init bdev_cache_init(void)
bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode),
0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
SLAB_MEM_SPREAD|SLAB_ACCOUNT|SLAB_PANIC),
SLAB_ACCOUNT|SLAB_PANIC),
init_once);
err = register_filesystem(&bd_type);
if (err)

View File

@ -127,7 +127,7 @@ static void bfqg_stats_update_group_wait_time(struct bfqg_stats *stats)
if (!bfqg_stats_waiting(stats))
return;
now = ktime_get_ns();
now = blk_time_get_ns();
if (now > stats->start_group_wait_time)
bfq_stat_add(&stats->group_wait_time,
now - stats->start_group_wait_time);
@ -144,7 +144,7 @@ static void bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg,
return;
if (bfqg == curr_bfqg)
return;
stats->start_group_wait_time = ktime_get_ns();
stats->start_group_wait_time = blk_time_get_ns();
bfqg_stats_mark_waiting(stats);
}
@ -156,7 +156,7 @@ static void bfqg_stats_end_empty_time(struct bfqg_stats *stats)
if (!bfqg_stats_empty(stats))
return;
now = ktime_get_ns();
now = blk_time_get_ns();
if (now > stats->start_empty_time)
bfq_stat_add(&stats->empty_time,
now - stats->start_empty_time);
@ -183,7 +183,7 @@ void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg)
if (bfqg_stats_empty(stats))
return;
stats->start_empty_time = ktime_get_ns();
stats->start_empty_time = blk_time_get_ns();
bfqg_stats_mark_empty(stats);
}
@ -192,7 +192,7 @@ void bfqg_stats_update_idle_time(struct bfq_group *bfqg)
struct bfqg_stats *stats = &bfqg->stats;
if (bfqg_stats_idling(stats)) {
u64 now = ktime_get_ns();
u64 now = blk_time_get_ns();
if (now > stats->start_idle_time)
bfq_stat_add(&stats->idle_time,
@ -205,7 +205,7 @@ void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg)
{
struct bfqg_stats *stats = &bfqg->stats;
stats->start_idle_time = ktime_get_ns();
stats->start_idle_time = blk_time_get_ns();
bfqg_stats_mark_idling(stats);
}
@ -242,7 +242,7 @@ void bfqg_stats_update_completion(struct bfq_group *bfqg, u64 start_time_ns,
u64 io_start_time_ns, blk_opf_t opf)
{
struct bfqg_stats *stats = &bfqg->stats;
u64 now = ktime_get_ns();
u64 now = blk_time_get_ns();
if (now > io_start_time_ns)
blkg_rwstat_add(&stats->service_time, opf,

View File

@ -1005,7 +1005,7 @@ static struct request *bfq_check_fifo(struct bfq_queue *bfqq,
rq = rq_entry_fifo(bfqq->fifo.next);
if (rq == last || ktime_get_ns() < rq->fifo_time)
if (rq == last || blk_time_get_ns() < rq->fifo_time)
return NULL;
bfq_log_bfqq(bfqq->bfqd, bfqq, "check_fifo: returned %p", rq);
@ -1829,7 +1829,7 @@ static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd,
* bfq_bfqq_update_budg_for_activation for
* details on the usage of the next variable.
*/
arrived_in_time = ktime_get_ns() <=
arrived_in_time = blk_time_get_ns() <=
bfqq->ttime.last_end_request +
bfqd->bfq_slice_idle * 3;
unsigned int act_idx = bfq_actuator_index(bfqd, rq->bio);
@ -2208,7 +2208,7 @@ static void bfq_add_request(struct request *rq)
struct request *next_rq, *prev;
unsigned int old_wr_coeff = bfqq->wr_coeff;
bool interactive = false;
u64 now_ns = ktime_get_ns();
u64 now_ns = blk_time_get_ns();
bfq_log_bfqq(bfqd, bfqq, "add_request %d", rq_is_sync(rq));
bfqq->queued[rq_is_sync(rq)]++;
@ -2262,7 +2262,7 @@ static void bfq_add_request(struct request *rq)
bfqd->rqs_injected && bfqd->tot_rq_in_driver > 0)) &&
time_is_before_eq_jiffies(bfqq->decrease_time_jif +
msecs_to_jiffies(10))) {
bfqd->last_empty_occupied_ns = ktime_get_ns();
bfqd->last_empty_occupied_ns = blk_time_get_ns();
/*
* Start the state machine for measuring the
* total service time of rq: setting
@ -3294,7 +3294,7 @@ static void bfq_set_budget_timeout(struct bfq_data *bfqd,
else
timeout_coeff = bfqq->entity.weight / bfqq->entity.orig_weight;
bfqd->last_budget_start = ktime_get();
bfqd->last_budget_start = blk_time_get();
bfqq->budget_timeout = jiffies +
bfqd->bfq_timeout * timeout_coeff;
@ -3394,7 +3394,7 @@ static void bfq_arm_slice_timer(struct bfq_data *bfqd)
else if (bfqq->wr_coeff > 1)
sl = max_t(u32, sl, 20ULL * NSEC_PER_MSEC);
bfqd->last_idling_start = ktime_get();
bfqd->last_idling_start = blk_time_get();
bfqd->last_idling_start_jiffies = jiffies;
hrtimer_start(&bfqd->idle_slice_timer, ns_to_ktime(sl),
@ -3433,7 +3433,7 @@ static void bfq_reset_rate_computation(struct bfq_data *bfqd,
struct request *rq)
{
if (rq != NULL) { /* new rq dispatch now, reset accordingly */
bfqd->last_dispatch = bfqd->first_dispatch = ktime_get_ns();
bfqd->last_dispatch = bfqd->first_dispatch = blk_time_get_ns();
bfqd->peak_rate_samples = 1;
bfqd->sequential_samples = 0;
bfqd->tot_sectors_dispatched = bfqd->last_rq_max_size =
@ -3590,7 +3590,7 @@ reset_computation:
*/
static void bfq_update_peak_rate(struct bfq_data *bfqd, struct request *rq)
{
u64 now_ns = ktime_get_ns();
u64 now_ns = blk_time_get_ns();
if (bfqd->peak_rate_samples == 0) { /* first dispatch */
bfq_log(bfqd, "update_peak_rate: goto reset, samples %d",
@ -4162,7 +4162,7 @@ static bool bfq_bfqq_is_slow(struct bfq_data *bfqd, struct bfq_queue *bfqq,
if (compensate)
delta_ktime = bfqd->last_idling_start;
else
delta_ktime = ktime_get();
delta_ktime = blk_time_get();
delta_ktime = ktime_sub(delta_ktime, bfqd->last_budget_start);
delta_usecs = ktime_to_us(delta_ktime);
@ -5591,7 +5591,7 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
struct bfq_io_cq *bic, pid_t pid, int is_sync,
unsigned int act_idx)
{
u64 now_ns = ktime_get_ns();
u64 now_ns = blk_time_get_ns();
bfqq->actuator_idx = act_idx;
RB_CLEAR_NODE(&bfqq->entity.rb_node);
@ -5903,7 +5903,7 @@ static void bfq_update_io_thinktime(struct bfq_data *bfqd,
*/
if (bfqq->dispatched || bfq_bfqq_busy(bfqq))
return;
elapsed = ktime_get_ns() - bfqq->ttime.last_end_request;
elapsed = blk_time_get_ns() - bfqq->ttime.last_end_request;
elapsed = min_t(u64, elapsed, 2ULL * bfqd->bfq_slice_idle);
ttime->ttime_samples = (7*ttime->ttime_samples + 256) / 8;
@ -6194,7 +6194,7 @@ static bool __bfq_insert_request(struct bfq_data *bfqd, struct request *rq)
bfq_add_request(rq);
idle_timer_disabled = waiting && !bfq_bfqq_wait_request(bfqq);
rq->fifo_time = ktime_get_ns() + bfqd->bfq_fifo_expire[rq_is_sync(rq)];
rq->fifo_time = blk_time_get_ns() + bfqd->bfq_fifo_expire[rq_is_sync(rq)];
list_add_tail(&rq->queuelist, &bfqq->fifo);
bfq_rq_enqueued(bfqd, bfqq, rq);
@ -6370,7 +6370,7 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd)
bfq_weights_tree_remove(bfqq);
}
now_ns = ktime_get_ns();
now_ns = blk_time_get_ns();
bfqq->ttime.last_end_request = now_ns;
@ -6585,7 +6585,7 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd)
static void bfq_update_inject_limit(struct bfq_data *bfqd,
struct bfq_queue *bfqq)
{
u64 tot_time_ns = ktime_get_ns() - bfqd->last_empty_occupied_ns;
u64 tot_time_ns = blk_time_get_ns() - bfqd->last_empty_occupied_ns;
unsigned int old_limit = bfqq->inject_limit;
if (bfqq->last_serv_time_ns > 0 && bfqd->rqs_injected) {

View File

@ -395,6 +395,7 @@ static blk_status_t bio_integrity_process(struct bio *bio,
iter.tuple_size = bi->tuple_size;
iter.seed = proc_iter->bi_sector;
iter.prot_buf = bvec_virt(bip->bip_vec);
iter.pi_offset = bi->pi_offset;
__bio_for_each_segment(bv, bio, bviter, *proc_iter) {
void *kaddr = bvec_kmap_local(&bv);

View File

@ -16,7 +16,6 @@
#include <linux/workqueue.h>
#include <linux/cgroup.h>
#include <linux/highmem.h>
#include <linux/sched/sysctl.h>
#include <linux/blk-crypto.h>
#include <linux/xarray.h>
@ -763,29 +762,31 @@ static inline void bio_put_percpu_cache(struct bio *bio)
struct bio_alloc_cache *cache;
cache = per_cpu_ptr(bio->bi_pool->cache, get_cpu());
if (READ_ONCE(cache->nr_irq) + cache->nr > ALLOC_CACHE_MAX) {
put_cpu();
bio_free(bio);
return;
}
if (READ_ONCE(cache->nr_irq) + cache->nr > ALLOC_CACHE_MAX)
goto out_free;
bio_uninit(bio);
if ((bio->bi_opf & REQ_POLLED) && !WARN_ON_ONCE(in_interrupt())) {
if (in_task()) {
bio_uninit(bio);
bio->bi_next = cache->free_list;
/* Not necessary but helps not to iopoll already freed bios */
bio->bi_bdev = NULL;
cache->free_list = bio;
cache->nr++;
} else {
unsigned long flags;
} else if (in_hardirq()) {
lockdep_assert_irqs_disabled();
local_irq_save(flags);
bio_uninit(bio);
bio->bi_next = cache->free_list_irq;
cache->free_list_irq = bio;
cache->nr_irq++;
local_irq_restore(flags);
} else {
goto out_free;
}
put_cpu();
return;
out_free:
put_cpu();
bio_free(bio);
}
/**
@ -1154,7 +1155,7 @@ void __bio_release_pages(struct bio *bio, bool mark_dirty)
bio_for_each_folio_all(fi, bio) {
struct page *page;
size_t done = 0;
size_t nr_pages;
if (mark_dirty) {
folio_lock(fi.folio);
@ -1162,10 +1163,11 @@ void __bio_release_pages(struct bio *bio, bool mark_dirty)
folio_unlock(fi.folio);
}
page = folio_page(fi.folio, fi.offset / PAGE_SIZE);
nr_pages = (fi.offset + fi.length - 1) / PAGE_SIZE -
fi.offset / PAGE_SIZE + 1;
do {
bio_release_page(bio, page++);
done += PAGE_SIZE;
} while (done < fi.length);
} while (--nr_pages != 0);
}
}
EXPORT_SYMBOL_GPL(__bio_release_pages);
@ -1371,21 +1373,12 @@ int submit_bio_wait(struct bio *bio)
{
DECLARE_COMPLETION_ONSTACK_MAP(done,
bio->bi_bdev->bd_disk->lockdep_map);
unsigned long hang_check;
bio->bi_private = &done;
bio->bi_end_io = submit_bio_wait_endio;
bio->bi_opf |= REQ_SYNC;
submit_bio(bio);
/* Prevent hang_check timer from firing at us during very long I/O */
hang_check = sysctl_hung_task_timeout_secs;
if (hang_check)
while (!wait_for_completion_io_timeout(&done,
hang_check * (HZ/2)))
;
else
wait_for_completion_io(&done);
blk_wait_io(&done);
return blk_status_to_errno(bio->bi_status);
}

View File

@ -1846,7 +1846,7 @@ static void blkcg_maybe_throttle_blkg(struct blkcg_gq *blkg, bool use_memdelay)
{
unsigned long pflags;
bool clamp;
u64 now = ktime_to_ns(ktime_get());
u64 now = blk_time_get_ns();
u64 exp;
u64 delay_nsec = 0;
int tok;

View File

@ -19,6 +19,7 @@
#include <linux/kthread.h>
#include <linux/blk-mq.h>
#include <linux/llist.h>
#include "blk.h"
struct blkcg_gq;
struct blkg_policy_data;

View File

@ -394,24 +394,34 @@ static void blk_timeout_work(struct work_struct *work)
{
}
struct request_queue *blk_alloc_queue(int node_id)
struct request_queue *blk_alloc_queue(struct queue_limits *lim, int node_id)
{
struct request_queue *q;
int error;
q = kmem_cache_alloc_node(blk_requestq_cachep, GFP_KERNEL | __GFP_ZERO,
node_id);
if (!q)
return NULL;
return ERR_PTR(-ENOMEM);
q->last_merge = NULL;
q->id = ida_alloc(&blk_queue_ida, GFP_KERNEL);
if (q->id < 0)
if (q->id < 0) {
error = q->id;
goto fail_q;
}
q->stats = blk_alloc_queue_stats();
if (!q->stats)
if (!q->stats) {
error = -ENOMEM;
goto fail_id;
}
error = blk_set_default_limits(lim);
if (error)
goto fail_stats;
q->limits = *lim;
q->node = node_id;
@ -425,6 +435,7 @@ struct request_queue *blk_alloc_queue(int node_id)
mutex_init(&q->debugfs_mutex);
mutex_init(&q->sysfs_lock);
mutex_init(&q->sysfs_dir_lock);
mutex_init(&q->limits_lock);
mutex_init(&q->rq_qos_mutex);
spin_lock_init(&q->queue_lock);
@ -435,12 +446,12 @@ struct request_queue *blk_alloc_queue(int node_id)
* Init percpu_ref in atomic mode so that it's faster to shutdown.
* See blk_register_queue() for details.
*/
if (percpu_ref_init(&q->q_usage_counter,
error = percpu_ref_init(&q->q_usage_counter,
blk_queue_usage_counter_release,
PERCPU_REF_INIT_ATOMIC, GFP_KERNEL))
PERCPU_REF_INIT_ATOMIC, GFP_KERNEL);
if (error)
goto fail_stats;
blk_set_default_limits(&q->limits);
q->nr_requests = BLKDEV_DEFAULT_RQ;
return q;
@ -451,7 +462,7 @@ fail_id:
ida_free(&blk_queue_ida, q->id);
fail_q:
kmem_cache_free(blk_requestq_cachep, q);
return NULL;
return ERR_PTR(error);
}
/**
@ -1083,6 +1094,7 @@ void blk_start_plug_nr_ios(struct blk_plug *plug, unsigned short nr_ios)
if (tsk->plug)
return;
plug->cur_ktime = 0;
plug->mq_list = NULL;
plug->cached_rq = NULL;
plug->nr_ios = min_t(unsigned short, nr_ios, BLK_MAX_REQUEST_COUNT);
@ -1182,6 +1194,8 @@ void __blk_flush_plug(struct blk_plug *plug, bool from_schedule)
*/
if (unlikely(!rq_list_empty(plug->cached_rq)))
blk_mq_free_plug_rqs(plug);
current->flags &= ~PF_BLOCK_TS;
}
/**
@ -1229,8 +1243,7 @@ int __init blk_dev_init(void)
if (!kblockd_workqueue)
panic("Failed to create kblockd\n");
blk_requestq_cachep = kmem_cache_create("request_queue",
sizeof(struct request_queue), 0, SLAB_PANIC, NULL);
blk_requestq_cachep = KMEM_CACHE(request_queue, SLAB_PANIC);
blk_debugfs_root = debugfs_create_dir("block", NULL);

View File

@ -143,7 +143,7 @@ static void blk_account_io_flush(struct request *rq)
part_stat_lock();
part_stat_inc(part, ios[STAT_FLUSH]);
part_stat_add(part, nsecs[STAT_FLUSH],
ktime_get_ns() - rq->start_time_ns);
blk_time_get_ns() - rq->start_time_ns);
part_stat_unlock();
}

View File

@ -370,6 +370,7 @@ void blk_integrity_register(struct gendisk *disk, struct blk_integrity *template
bi->profile = template->profile ? template->profile : &nop_profile;
bi->tuple_size = template->tuple_size;
bi->tag_size = template->tag_size;
bi->pi_offset = template->pi_offset;
blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, disk->queue);

View File

@ -829,7 +829,7 @@ static int ioc_autop_idx(struct ioc *ioc, struct gendisk *disk)
/* step up/down based on the vrate */
vrate_pct = div64_u64(ioc->vtime_base_rate * 100, VTIME_PER_USEC);
now_ns = ktime_get_ns();
now_ns = blk_time_get_ns();
if (p->too_fast_vrate_pct && p->too_fast_vrate_pct <= vrate_pct) {
if (!ioc->autop_too_fast_at)
@ -1044,7 +1044,7 @@ static void ioc_now(struct ioc *ioc, struct ioc_now *now)
unsigned seq;
u64 vrate;
now->now_ns = ktime_get();
now->now_ns = blk_time_get_ns();
now->now = ktime_to_us(now->now_ns);
vrate = atomic64_read(&ioc->vtime_rate);
@ -2817,7 +2817,7 @@ static void ioc_rqos_done(struct rq_qos *rqos, struct request *rq)
return;
}
on_q_ns = ktime_get_ns() - rq->alloc_time_ns;
on_q_ns = blk_time_get_ns() - rq->alloc_time_ns;
rq_wait_ns = rq->start_time_ns - rq->alloc_time_ns;
size_nsec = div64_u64(calc_size_vtime_cost(rq, ioc), VTIME_PER_NSEC);
@ -2900,7 +2900,7 @@ static int blk_iocost_init(struct gendisk *disk)
ioc->vtime_base_rate = VTIME_PER_USEC;
atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC);
seqcount_spinlock_init(&ioc->period_seqcount, &ioc->lock);
ioc->period_at = ktime_to_us(ktime_get());
ioc->period_at = ktime_to_us(blk_time_get());
atomic64_set(&ioc->cur_period, 0);
atomic_set(&ioc->hweight_gen, 0);

View File

@ -609,7 +609,7 @@ static void blkcg_iolatency_done_bio(struct rq_qos *rqos, struct bio *bio)
if (!iolat->blkiolat->enabled)
return;
now = ktime_to_ns(ktime_get());
now = blk_time_get_ns();
while (blkg && blkg->parent) {
iolat = blkg_to_lat(blkg);
if (!iolat) {
@ -661,7 +661,7 @@ static void blkiolatency_timer_fn(struct timer_list *t)
struct blk_iolatency *blkiolat = from_timer(blkiolat, t, timer);
struct blkcg_gq *blkg;
struct cgroup_subsys_state *pos_css;
u64 now = ktime_to_ns(ktime_get());
u64 now = blk_time_get_ns();
rcu_read_lock();
blkg_for_each_descendant_pre(blkg, pos_css,
@ -985,7 +985,7 @@ static void iolatency_pd_init(struct blkg_policy_data *pd)
struct blkcg_gq *blkg = lat_to_blkg(iolat);
struct rq_qos *rqos = iolat_rq_qos(blkg->q);
struct blk_iolatency *blkiolat = BLKIOLATENCY(rqos);
u64 now = ktime_to_ns(ktime_get());
u64 now = blk_time_get_ns();
int cpu;
if (blk_queue_nonrot(blkg->q))

View File

@ -35,6 +35,26 @@ static sector_t bio_discard_limit(struct block_device *bdev, sector_t sector)
return round_down(UINT_MAX, discard_granularity) >> SECTOR_SHIFT;
}
/*
 * await_bio_endio - bi_end_io callback installed by await_bio_chain()
 *
 * Signals the completion object stored in @bio->bi_private, then calls
 * bio_put() on @bio.
 */
static void await_bio_endio(struct bio *bio)
{
complete(bio->bi_private);
bio_put(bio);
}
/*
 * await_bio_chain - ends @bio and waits for every chained bio to complete
 *
 * Points @bio->bi_private at an on-stack completion and installs
 * await_bio_endio() as the end_io handler, calls bio_endio() on @bio, and
 * then blocks in blk_wait_io() until that handler signals the completion.
 *
 * The issue loops in this file call it when a fatal signal is pending,
 * immediately before returning -EINTR. await_bio_endio() calls bio_put()
 * on @bio, and the secure-erase caller clears its local pointer afterwards
 * rather than reusing it.
 */
static void await_bio_chain(struct bio *bio)
{
DECLARE_COMPLETION_ONSTACK_MAP(done,
bio->bi_bdev->bd_disk->lockdep_map);
bio->bi_private = &done;
bio->bi_end_io = await_bio_endio;
/*
 * NOTE(review): presumably the handler only fires once all bios chained
 * to @bio have finished -- confirm against bio_endio()'s chain handling.
 */
bio_endio(bio);
blk_wait_io(&done);
}
int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
sector_t nr_sects, gfp_t gfp_mask, struct bio **biop)
{
@ -77,6 +97,10 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
* is disabled.
*/
cond_resched();
if (fatal_signal_pending(current)) {
await_bio_chain(bio);
return -EINTR;
}
}
*biop = bio;
@ -120,32 +144,33 @@ static int __blkdev_issue_write_zeroes(struct block_device *bdev,
struct bio **biop, unsigned flags)
{
struct bio *bio = *biop;
unsigned int max_write_zeroes_sectors;
unsigned int max_sectors;
if (bdev_read_only(bdev))
return -EPERM;
/* Ensure that max_write_zeroes_sectors doesn't overflow bi_size */
max_write_zeroes_sectors = bdev_write_zeroes_sectors(bdev);
/* Ensure that max_sectors doesn't overflow bi_size */
max_sectors = bdev_write_zeroes_sectors(bdev);
if (max_write_zeroes_sectors == 0)
if (max_sectors == 0)
return -EOPNOTSUPP;
while (nr_sects) {
unsigned int len = min_t(sector_t, nr_sects, max_sectors);
bio = blk_next_bio(bio, bdev, 0, REQ_OP_WRITE_ZEROES, gfp_mask);
bio->bi_iter.bi_sector = sector;
if (flags & BLKDEV_ZERO_NOUNMAP)
bio->bi_opf |= REQ_NOUNMAP;
if (nr_sects > max_write_zeroes_sectors) {
bio->bi_iter.bi_size = max_write_zeroes_sectors << 9;
nr_sects -= max_write_zeroes_sectors;
sector += max_write_zeroes_sectors;
} else {
bio->bi_iter.bi_size = nr_sects << 9;
nr_sects = 0;
}
bio->bi_iter.bi_size = len << SECTOR_SHIFT;
nr_sects -= len;
sector += len;
cond_resched();
if (fatal_signal_pending(current)) {
await_bio_chain(bio);
return -EINTR;
}
}
*biop = bio;
@ -190,6 +215,10 @@ static int __blkdev_issue_zero_pages(struct block_device *bdev,
break;
}
cond_resched();
if (fatal_signal_pending(current)) {
await_bio_chain(bio);
return -EINTR;
}
}
*biop = bio;
@ -280,7 +309,7 @@ retry:
bio_put(bio);
}
blk_finish_plug(&plug);
if (ret && try_write_zeroes) {
if (ret && ret != -EINTR && try_write_zeroes) {
if (!(flags & BLKDEV_ZERO_NOFALLBACK)) {
try_write_zeroes = false;
goto retry;
@ -322,7 +351,7 @@ int blkdev_issue_secure_erase(struct block_device *bdev, sector_t sector,
return -EPERM;
blk_start_plug(&plug);
for (;;) {
while (nr_sects) {
unsigned int len = min_t(sector_t, nr_sects, max_sectors);
bio = blk_next_bio(bio, bdev, 0, REQ_OP_SECURE_ERASE, gfp);
@ -331,12 +360,17 @@ int blkdev_issue_secure_erase(struct block_device *bdev, sector_t sector,
sector += len;
nr_sects -= len;
if (!nr_sects) {
ret = submit_bio_wait(bio);
bio_put(bio);
cond_resched();
if (fatal_signal_pending(current)) {
await_bio_chain(bio);
ret = -EINTR;
bio = NULL;
break;
}
cond_resched();
}
if (bio) {
ret = submit_bio_wait(bio);
bio_put(bio);
}
blk_finish_plug(&plug);

View File

@ -21,7 +21,6 @@
#include <linux/llist.h>
#include <linux/cpu.h>
#include <linux/cache.h>
#include <linux/sched/sysctl.h>
#include <linux/sched/topology.h>
#include <linux/sched/signal.h>
#include <linux/delay.h>
@ -322,7 +321,7 @@ void blk_rq_init(struct request_queue *q, struct request *rq)
RB_CLEAR_NODE(&rq->rb_node);
rq->tag = BLK_MQ_NO_TAG;
rq->internal_tag = BLK_MQ_NO_TAG;
rq->start_time_ns = ktime_get_ns();
rq->start_time_ns = blk_time_get_ns();
rq->part = NULL;
blk_crypto_rq_set_defaults(rq);
}
@ -332,7 +331,7 @@ EXPORT_SYMBOL(blk_rq_init);
static inline void blk_mq_rq_time_init(struct request *rq, u64 alloc_time_ns)
{
if (blk_mq_need_time_stamp(rq))
rq->start_time_ns = ktime_get_ns();
rq->start_time_ns = blk_time_get_ns();
else
rq->start_time_ns = 0;
@ -443,7 +442,7 @@ static struct request *__blk_mq_alloc_requests(struct blk_mq_alloc_data *data)
/* alloc_time includes depth and tag waits */
if (blk_queue_rq_alloc_time(q))
alloc_time_ns = ktime_get_ns();
alloc_time_ns = blk_time_get_ns();
if (data->cmd_flags & REQ_NOWAIT)
data->flags |= BLK_MQ_REQ_NOWAIT;
@ -628,7 +627,7 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
/* alloc_time includes depth and tag waits */
if (blk_queue_rq_alloc_time(q))
alloc_time_ns = ktime_get_ns();
alloc_time_ns = blk_time_get_ns();
/*
* If the tag allocator sleeps we could get an allocation for a
@ -1041,7 +1040,7 @@ static inline void __blk_mq_end_request_acct(struct request *rq, u64 now)
inline void __blk_mq_end_request(struct request *rq, blk_status_t error)
{
if (blk_mq_need_time_stamp(rq))
__blk_mq_end_request_acct(rq, ktime_get_ns());
__blk_mq_end_request_acct(rq, blk_time_get_ns());
blk_mq_finish_request(rq);
@ -1084,7 +1083,7 @@ void blk_mq_end_request_batch(struct io_comp_batch *iob)
u64 now = 0;
if (iob->need_ts)
now = ktime_get_ns();
now = blk_time_get_ns();
while ((rq = rq_list_pop(&iob->req_list)) != NULL) {
prefetch(rq->bio);
@ -1167,10 +1166,11 @@ static inline bool blk_mq_complete_need_ipi(struct request *rq)
if (force_irqthreads())
return false;
/* same CPU or cache domain? Complete locally */
/* same CPU or cache domain and capacity? Complete locally */
if (cpu == rq->mq_ctx->cpu ||
(!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags) &&
cpus_share_cache(cpu, rq->mq_ctx->cpu)))
cpus_share_cache(cpu, rq->mq_ctx->cpu) &&
cpus_equal_capacity(cpu, rq->mq_ctx->cpu)))
return false;
/* don't try to IPI to an offline CPU */
@ -1254,7 +1254,7 @@ void blk_mq_start_request(struct request *rq)
if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags) &&
!blk_rq_is_passthrough(rq)) {
rq->io_start_time_ns = ktime_get_ns();
rq->io_start_time_ns = blk_time_get_ns();
rq->stats_sectors = blk_rq_sectors(rq);
rq->rq_flags |= RQF_STATS;
rq_qos_issue(q, rq);
@ -1409,22 +1409,10 @@ blk_status_t blk_execute_rq(struct request *rq, bool at_head)
blk_mq_insert_request(rq, at_head ? BLK_MQ_INSERT_AT_HEAD : 0);
blk_mq_run_hw_queue(hctx, false);
if (blk_rq_is_poll(rq)) {
if (blk_rq_is_poll(rq))
blk_rq_poll_completion(rq, &wait.done);
} else {
/*
* Prevent hang_check timer from firing at us during very long
* I/O
*/
unsigned long hang_check = sysctl_hung_task_timeout_secs;
if (hang_check)
while (!wait_for_completion_io_timeout(&wait.done,
hang_check * (HZ/2)))
;
else
wait_for_completion_io(&wait.done);
}
else
blk_wait_io(&wait.done);
return wait.ret;
}
@ -2892,9 +2880,6 @@ static struct request *blk_mq_get_new_requests(struct request_queue *q,
};
struct request *rq;
if (blk_mq_attempt_bio_merge(q, bio, nsegs))
return NULL;
rq_qos_throttle(q, bio);
if (plug) {
@ -2913,23 +2898,32 @@ static struct request *blk_mq_get_new_requests(struct request_queue *q,
}
/*
* Check if we can use the passed on request for submitting the passed in bio,
* and remove it from the request list if it can be used.
* Check if there is a suitable cached request and return it.
*/
static bool blk_mq_use_cached_rq(struct request *rq, struct blk_plug *plug,
static struct request *blk_mq_peek_cached_request(struct blk_plug *plug,
		struct request_queue *q, blk_opf_t opf)
{
	enum hctx_type wanted = blk_mq_get_hctx_type(opf);
	enum hctx_type cached_type;
	struct request *cached;

	/* No plug means there is no cached request list to look at. */
	if (!plug)
		return NULL;

	/* The candidate must exist and belong to the queue we submit to. */
	cached = rq_list_peek(&plug->cached_rq);
	if (!cached)
		return NULL;
	if (cached->q != q)
		return NULL;

	/*
	 * The hctx types must match, except that a READ may be served by a
	 * request that was allocated from a DEFAULT hctx.
	 */
	cached_type = cached->mq_hctx->type;
	if (wanted != cached_type &&
	    !(wanted == HCTX_TYPE_READ && cached_type == HCTX_TYPE_DEFAULT))
		return NULL;

	/* Flush and non-flush operations may not share a cached request. */
	if (op_is_flush(cached->cmd_flags) != op_is_flush(opf))
		return NULL;

	return cached;
}
static void blk_mq_use_cached_rq(struct request *rq, struct blk_plug *plug,
struct bio *bio)
{
enum hctx_type type = blk_mq_get_hctx_type(bio->bi_opf);
enum hctx_type hctx_type = rq->mq_hctx->type;
WARN_ON_ONCE(rq_list_peek(&plug->cached_rq) != rq);
if (type != hctx_type &&
!(type == HCTX_TYPE_READ && hctx_type == HCTX_TYPE_DEFAULT))
return false;
if (op_is_flush(rq->cmd_flags) != op_is_flush(bio->bi_opf))
return false;
/*
* If any qos ->throttle() end up blocking, we will have flushed the
* plug and hence killed the cached_rq list as well. Pop this entry
@ -2941,7 +2935,6 @@ static bool blk_mq_use_cached_rq(struct request *rq, struct blk_plug *plug,
blk_mq_rq_time_init(rq, 0);
rq->cmd_flags = bio->bi_opf;
INIT_LIST_HEAD(&rq->queuelist);
return true;
}
/**
@ -2963,50 +2956,43 @@ void blk_mq_submit_bio(struct bio *bio)
struct blk_plug *plug = blk_mq_plug(bio);
const int is_sync = op_is_sync(bio->bi_opf);
struct blk_mq_hw_ctx *hctx;
struct request *rq = NULL;
unsigned int nr_segs = 1;
struct request *rq;
blk_status_t ret;
bio = blk_queue_bounce(bio, q);
if (plug) {
rq = rq_list_peek(&plug->cached_rq);
if (rq && rq->q != q)
rq = NULL;
}
if (rq) {
if (unlikely(bio_may_exceed_limits(bio, &q->limits))) {
bio = __bio_split_to_limits(bio, &q->limits, &nr_segs);
if (!bio)
return;
}
if (!bio_integrity_prep(bio))
return;
if (blk_mq_attempt_bio_merge(q, bio, nr_segs))
return;
if (blk_mq_use_cached_rq(rq, plug, bio))
goto done;
percpu_ref_get(&q->q_usage_counter);
} else {
/*
* If the plug has a cached request for this queue, try use it.
*
* The cached request already holds a q_usage_counter reference and we
* don't have to acquire a new one if we use it.
*/
rq = blk_mq_peek_cached_request(plug, q, bio->bi_opf);
if (!rq) {
if (unlikely(bio_queue_enter(bio)))
return;
if (unlikely(bio_may_exceed_limits(bio, &q->limits))) {
bio = __bio_split_to_limits(bio, &q->limits, &nr_segs);
if (!bio)
goto fail;
}
if (!bio_integrity_prep(bio))
goto fail;
}
rq = blk_mq_get_new_requests(q, plug, bio, nr_segs);
if (unlikely(!rq)) {
fail:
blk_queue_exit(q);
return;
if (unlikely(bio_may_exceed_limits(bio, &q->limits))) {
bio = __bio_split_to_limits(bio, &q->limits, &nr_segs);
if (!bio)
goto queue_exit;
}
if (!bio_integrity_prep(bio))
goto queue_exit;
if (blk_mq_attempt_bio_merge(q, bio, nr_segs))
goto queue_exit;
if (!rq) {
rq = blk_mq_get_new_requests(q, plug, bio, nr_segs);
if (unlikely(!rq))
goto queue_exit;
} else {
blk_mq_use_cached_rq(rq, plug, bio);
}
done:
trace_block_getrq(bio);
rq_qos_track(q, rq, bio);
@ -3037,6 +3023,15 @@ done:
} else {
blk_mq_run_dispatch_ops(q, blk_mq_try_issue_directly(hctx, rq));
}
return;
queue_exit:
/*
* Don't drop the queue reference if we were trying to use a cached
* request and thus didn't acquire one.
*/
if (!rq)
blk_queue_exit(q);
}
#ifdef CONFIG_BLK_MQ_STACKING
@ -3098,7 +3093,7 @@ blk_status_t blk_insert_cloned_request(struct request *rq)
blk_mq_run_dispatch_ops(q,
ret = blk_mq_request_issue_directly(rq, true));
if (ret)
blk_account_io_done(rq, ktime_get_ns());
blk_account_io_done(rq, blk_time_get_ns());
return ret;
}
EXPORT_SYMBOL_GPL(blk_insert_cloned_request);
@ -4078,15 +4073,16 @@ void blk_mq_release(struct request_queue *q)
blk_mq_sysfs_deinit(q);
}
static struct request_queue *blk_mq_init_queue_data(struct blk_mq_tag_set *set,
void *queuedata)
struct request_queue *blk_mq_alloc_queue(struct blk_mq_tag_set *set,
struct queue_limits *lim, void *queuedata)
{
struct queue_limits default_lim = { };
struct request_queue *q;
int ret;
q = blk_alloc_queue(set->numa_node);
if (!q)
return ERR_PTR(-ENOMEM);
q = blk_alloc_queue(lim ? lim : &default_lim, set->numa_node);
if (IS_ERR(q))
return q;
q->queuedata = queuedata;
ret = blk_mq_init_allocated_queue(set, q);
if (ret) {
@ -4095,20 +4091,15 @@ static struct request_queue *blk_mq_init_queue_data(struct blk_mq_tag_set *set,
}
return q;
}
struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
{
return blk_mq_init_queue_data(set, NULL);
}
EXPORT_SYMBOL(blk_mq_init_queue);
EXPORT_SYMBOL(blk_mq_alloc_queue);
/**
* blk_mq_destroy_queue - shutdown a request queue
* @q: request queue to shutdown
*
* This shuts down a request queue allocated by blk_mq_init_queue(). All future
* This shuts down a request queue allocated by blk_mq_alloc_queue(). All future
* requests will be failed with -ENODEV. The caller is responsible for dropping
* the reference from blk_mq_init_queue() by calling blk_put_queue().
* the reference from blk_mq_alloc_queue() by calling blk_put_queue().
*
* Context: can sleep
*/
@ -4129,13 +4120,14 @@ void blk_mq_destroy_queue(struct request_queue *q)
}
EXPORT_SYMBOL(blk_mq_destroy_queue);
struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set, void *queuedata,
struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set,
struct queue_limits *lim, void *queuedata,
struct lock_class_key *lkclass)
{
struct request_queue *q;
struct gendisk *disk;
q = blk_mq_init_queue_data(set, queuedata);
q = blk_mq_alloc_queue(set, lim, queuedata);
if (IS_ERR(q))
return ERR_CAST(q);
@ -4389,7 +4381,7 @@ static void blk_mq_update_queue_map(struct blk_mq_tag_set *set)
if (set->nr_maps == 1)
set->map[HCTX_TYPE_DEFAULT].nr_queues = set->nr_hw_queues;
if (set->ops->map_queues && !is_kdump_kernel()) {
if (set->ops->map_queues) {
int i;
/*
@ -4488,14 +4480,12 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
/*
* If a crashdump is active, then we are potentially in a very
* memory constrained environment. Limit us to 1 queue and
* 64 tags to prevent using too much memory.
* memory constrained environment. Limit us to 64 tags to prevent
* using too much memory.
*/
if (is_kdump_kernel()) {
set->nr_hw_queues = 1;
set->nr_maps = 1;
if (is_kdump_kernel())
set->queue_depth = min(64U, set->queue_depth);
}
/*
* There is no use for more h/w queues than cpus if we just have
* a single map
@ -4525,7 +4515,7 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
GFP_KERNEL, set->numa_node);
if (!set->map[i].mq_map)
goto out_free_mq_map;
set->map[i].nr_queues = is_kdump_kernel() ? 1 : set->nr_hw_queues;
set->map[i].nr_queues = set->nr_hw_queues;
}
blk_mq_update_queue_map(set);

View File

@ -25,53 +25,22 @@ void blk_queue_rq_timeout(struct request_queue *q, unsigned int timeout)
}
EXPORT_SYMBOL_GPL(blk_queue_rq_timeout);
/**
* blk_set_default_limits - reset limits to default values
* @lim: the queue_limits structure to reset
*
* Description:
* Returns a queue_limit struct to its default state.
*/
void blk_set_default_limits(struct queue_limits *lim)
{
lim->max_segments = BLK_MAX_SEGMENTS;
lim->max_discard_segments = 1;
lim->max_integrity_segments = 0;
lim->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK;
lim->virt_boundary_mask = 0;
lim->max_segment_size = BLK_MAX_SEGMENT_SIZE;
lim->max_sectors = lim->max_hw_sectors = BLK_SAFE_MAX_SECTORS;
lim->max_user_sectors = lim->max_dev_sectors = 0;
lim->chunk_sectors = 0;
lim->max_write_zeroes_sectors = 0;
lim->max_zone_append_sectors = 0;
lim->max_discard_sectors = 0;
lim->max_hw_discard_sectors = 0;
lim->max_secure_erase_sectors = 0;
lim->discard_granularity = 512;
lim->discard_alignment = 0;
lim->discard_misaligned = 0;
lim->logical_block_size = lim->physical_block_size = lim->io_min = 512;
lim->bounce = BLK_BOUNCE_NONE;
lim->alignment_offset = 0;
lim->io_opt = 0;
lim->misaligned = 0;
lim->zoned = false;
lim->zone_write_granularity = 0;
lim->dma_alignment = 511;
}
/**
* blk_set_stacking_limits - set default limits for stacking devices
* @lim: the queue_limits structure to reset
*
* Description:
* Returns a queue_limit struct to its default state. Should be used
* by stacking drivers like DM that have no internal limits.
* Prepare queue limits for applying limits from underlying devices using
* blk_stack_limits().
*/
void blk_set_stacking_limits(struct queue_limits *lim)
{
blk_set_default_limits(lim);
memset(lim, 0, sizeof(*lim));
lim->logical_block_size = SECTOR_SIZE;
lim->physical_block_size = SECTOR_SIZE;
lim->io_min = SECTOR_SIZE;
lim->discard_granularity = SECTOR_SIZE;
lim->dma_alignment = SECTOR_SIZE - 1;
lim->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK;
/* Inherit limits from component devices */
lim->max_segments = USHRT_MAX;
@ -82,9 +51,239 @@ void blk_set_stacking_limits(struct queue_limits *lim)
lim->max_dev_sectors = UINT_MAX;
lim->max_write_zeroes_sectors = UINT_MAX;
lim->max_zone_append_sectors = UINT_MAX;
lim->max_user_discard_sectors = UINT_MAX;
}
EXPORT_SYMBOL(blk_set_stacking_limits);
/*
* Derive the read-ahead and I/O size hints stored in @bdi from @lim.
*
* Only bdi->ra_pages and bdi->io_pages are written; @lim is only read.
*/
static void blk_apply_bdi_limits(struct backing_dev_info *bdi,
struct queue_limits *lim)
{
/*
* For read-ahead of large files to be effective, we need to read ahead
* at least twice the optimal I/O size. The result is never allowed to
* drop below VM_READAHEAD_PAGES.
*/
bdi->ra_pages = max(lim->io_opt * 2 / PAGE_SIZE, VM_READAHEAD_PAGES);
/* NOTE(review): the shift presumably converts sectors to pages - confirm */
bdi->io_pages = lim->max_sectors >> PAGE_SECTORS_SHIFT;
}
static int blk_validate_zoned_limits(struct queue_limits *lim)
{
	if (lim->zoned) {
		/* Zoned limits are rejected when zoned support is not built in. */
		if (WARN_ON_ONCE(!IS_ENABLED(CONFIG_BLK_DEV_ZONED)))
			return -EINVAL;

		/* Raise the write granularity to at least one logical block. */
		if (lim->zone_write_granularity < lim->logical_block_size)
			lim->zone_write_granularity = lim->logical_block_size;

		/*
		 * The Zone Append size is limited by the maximum I/O size
		 * and the zone size given that it can't span zones.
		 */
		if (lim->max_zone_append_sectors)
			lim->max_zone_append_sectors =
				min3(lim->max_hw_sectors,
				     lim->max_zone_append_sectors,
				     lim->chunk_sectors);

		return 0;
	}

	/* A non-zoned queue must leave every zone-related limit at zero. */
	if (WARN_ON_ONCE(lim->max_open_zones) ||
	    WARN_ON_ONCE(lim->max_active_zones) ||
	    WARN_ON_ONCE(lim->zone_write_granularity) ||
	    WARN_ON_ONCE(lim->max_zone_append_sectors))
		return -EINVAL;

	return 0;
}
/*
* Check that the limits in lim are valid, initialize defaults for unset
* values, and cap values based on others where needed.
*
* @lim is modified in place. Returns -EINVAL as soon as one of the checks
* below fails (fields adjusted before that point stay adjusted), otherwise
* returns the result of blk_validate_zoned_limits().
*/
static int blk_validate_limits(struct queue_limits *lim)
{
unsigned int max_hw_sectors;
/*
* Unless otherwise specified, default to 512 byte logical blocks and a
* physical block size equal to the logical block size.
*/
if (!lim->logical_block_size)
lim->logical_block_size = SECTOR_SIZE;
if (lim->physical_block_size < lim->logical_block_size)
lim->physical_block_size = lim->logical_block_size;
/*
* The minimum I/O size defaults to the physical block size unless
* explicitly overridden.
*/
if (lim->io_min < lim->physical_block_size)
lim->io_min = lim->physical_block_size;
/*
* max_hw_sectors has a somewhat weird default for historical reasons,
* but drivers really should set their own instead of relying on this
* value.
*
* The block layer relies on the fact that every driver can
* handle at least a page worth of data per I/O, and needs the value
* aligned to the logical block size.
*/
if (!lim->max_hw_sectors)
lim->max_hw_sectors = BLK_SAFE_MAX_SECTORS;
if (WARN_ON_ONCE(lim->max_hw_sectors < PAGE_SECTORS))
return -EINVAL;
lim->max_hw_sectors = round_down(lim->max_hw_sectors,
lim->logical_block_size >> SECTOR_SHIFT);
/*
* The actual max_sectors value is a complex beast and also takes the
* max_dev_sectors value (set by SCSI ULPs) and a user configurable
* value into account. The ->max_sectors value is always calculated
* from these, so directly setting it won't have any effect.
*/
max_hw_sectors = min_not_zero(lim->max_hw_sectors,
lim->max_dev_sectors);
if (lim->max_user_sectors) {
/*
* A user supplied limit must fit within the hardware/device limit
* and must cover at least one page worth of sectors.
*/
if (lim->max_user_sectors > max_hw_sectors ||
lim->max_user_sectors < PAGE_SIZE / SECTOR_SIZE)
return -EINVAL;
lim->max_sectors = min(max_hw_sectors, lim->max_user_sectors);
} else {
/* No user limit: cap the default at BLK_DEF_MAX_SECTORS_CAP. */
lim->max_sectors = min(max_hw_sectors, BLK_DEF_MAX_SECTORS_CAP);
}
lim->max_sectors = round_down(lim->max_sectors,
lim->logical_block_size >> SECTOR_SHIFT);
/*
* Random default for the maximum number of segments. Drivers should not
* rely on this and should set their own.
*/
if (!lim->max_segments)
lim->max_segments = BLK_MAX_SEGMENTS;
/*
* The effective discard limit is the smaller of the hardware limit and
* the user configured limit.
*/
lim->max_discard_sectors =
min(lim->max_hw_discard_sectors, lim->max_user_discard_sectors);
if (!lim->max_discard_segments)
lim->max_discard_segments = 1;
/* The discard granularity is raised to at least one physical block. */
if (lim->discard_granularity < lim->physical_block_size)
lim->discard_granularity = lim->physical_block_size;
/*
* By default there is no limit on the segment boundary alignment,
* but if there is one it can't be smaller than the page size as
* that would break all the normal I/O patterns.
*/
if (!lim->seg_boundary_mask)
lim->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK;
if (WARN_ON_ONCE(lim->seg_boundary_mask < PAGE_SIZE - 1))
return -EINVAL;
/*
* Devices that require a virtual boundary do not support scatter/gather
* I/O natively, but instead require a descriptor list entry for each
* page (which might not be identical to the Linux PAGE_SIZE). Because
* of that they are not limited by our notion of "segment size".
*/
if (lim->virt_boundary_mask) {
/* Only "unset" (0) or UINT_MAX may be combined with a virt boundary. */
if (WARN_ON_ONCE(lim->max_segment_size &&
lim->max_segment_size != UINT_MAX))
return -EINVAL;
lim->max_segment_size = UINT_MAX;
} else {
/*
* The maximum segment size has an odd historic 64k default that
* drivers probably should override. Just like the I/O size we
* require drivers to at least handle a full page per segment.
*/
if (!lim->max_segment_size)
lim->max_segment_size = BLK_MAX_SEGMENT_SIZE;
if (WARN_ON_ONCE(lim->max_segment_size < PAGE_SIZE))
return -EINVAL;
}
/*
* We require drivers to at least do logical block aligned I/O, but
* historically could not check for that due to the separate calls
* to set the limits. Once the transition is finished the check
* below should be narrowed down to check the logical block size.
*/
if (!lim->dma_alignment)
lim->dma_alignment = SECTOR_SIZE - 1;
if (WARN_ON_ONCE(lim->dma_alignment > PAGE_SIZE))
return -EINVAL;
/*
* Reduce a non-zero alignment offset to its low bits below the physical
* block size and clear the misaligned flag. The mask form assumes a
* power-of-two physical block size - TODO confirm.
*/
if (lim->alignment_offset) {
lim->alignment_offset &= (lim->physical_block_size - 1);
lim->misaligned = 0;
}
/* The zoned-specific limits are checked last and decide the result. */
return blk_validate_zoned_limits(lim);
}
/*
 * Apply the default limits to @lim for a newly allocated queue.
 *
 * @lim holds whatever initial limits the driver provided; it may carry no
 * limits at all, in which case every field is zero on entry.
 *
 * Returns the result of blk_validate_limits().
 */
int blk_set_default_limits(struct queue_limits *lim)
{
	int ret;

	/*
	 * blk_validate_limits() fills in most defaults by capping bounds.
	 * max_user_discard_sectors is the exception: it has to start out at
	 * the maximum value, so initialize it explicitly first.
	 */
	lim->max_user_discard_sectors = UINT_MAX;

	ret = blk_validate_limits(lim);
	return ret;
}
/**
 * queue_limits_commit_update - commit an atomic update of queue limits
 * @q:		queue to update
 * @lim:	limits to apply
 *
 * Validate the limits in @lim, which were obtained from
 * queue_limits_start_update() and then modified by the caller, and install
 * them in @q if they pass validation. @q->limits_lock is released on both
 * the success and the failure path.
 *
 * Returns 0 if successful, else a negative error code.
 */
int queue_limits_commit_update(struct request_queue *q,
		struct queue_limits *lim)
	__releases(q->limits_lock)
{
	int error;

	error = blk_validate_limits(lim);
	if (error)
		goto out_unlock;

	q->limits = *lim;
	/* Refresh the bdi hints when the queue has a disk attached. */
	if (q->disk)
		blk_apply_bdi_limits(q->disk->bdi, lim);

out_unlock:
	mutex_unlock(&q->limits_lock);
	return error;
}
EXPORT_SYMBOL_GPL(queue_limits_commit_update);
/**
 * queue_limits_set - apply queue limits to queue
 * @q:		queue to update
 * @lim:	limits to apply
 *
 * Install the freshly initialized limits in @lim on @q. Callers that want
 * to modify limits already in place should use the
 * queue_limits_start_update() / queue_limits_commit_update() pair instead.
 *
 * Returns 0 if successful, else a negative error code.
 */
int queue_limits_set(struct request_queue *q, struct queue_limits *lim)
{
	int ret;

	/* The lock taken here is dropped by queue_limits_commit_update(). */
	mutex_lock(&q->limits_lock);
	ret = queue_limits_commit_update(q, lim);

	return ret;
}
EXPORT_SYMBOL_GPL(queue_limits_set);
/**
* blk_queue_bounce_limit - set bounce buffer limit for queue
* @q: the request queue for the device
@ -177,8 +376,11 @@ EXPORT_SYMBOL(blk_queue_chunk_sectors);
void blk_queue_max_discard_sectors(struct request_queue *q,
unsigned int max_discard_sectors)
{
q->limits.max_hw_discard_sectors = max_discard_sectors;
q->limits.max_discard_sectors = max_discard_sectors;
struct queue_limits *lim = &q->limits;
lim->max_hw_discard_sectors = max_discard_sectors;
lim->max_discard_sectors =
min(max_discard_sectors, lim->max_user_discard_sectors);
}
EXPORT_SYMBOL(blk_queue_max_discard_sectors);
@ -393,15 +595,7 @@ EXPORT_SYMBOL(blk_queue_alignment_offset);
void disk_update_readahead(struct gendisk *disk)
{
struct request_queue *q = disk->queue;
/*
* For read-ahead of large files to be effective, we need to read ahead
* at least twice the optimal I/O size.
*/
disk->bdi->ra_pages =
max(queue_io_opt(q) * 2 / PAGE_SIZE, VM_READAHEAD_PAGES);
disk->bdi->io_pages = queue_max_sectors(q) >> (PAGE_SHIFT - 9);
blk_apply_bdi_limits(disk->bdi, &disk->queue->limits);
}
EXPORT_SYMBOL_GPL(disk_update_readahead);
@ -689,33 +883,38 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
t->zone_write_granularity = max(t->zone_write_granularity,
b->zone_write_granularity);
t->zoned = max(t->zoned, b->zoned);
if (!t->zoned) {
t->zone_write_granularity = 0;
t->max_zone_append_sectors = 0;
}
return ret;
}
EXPORT_SYMBOL(blk_stack_limits);
/**
* disk_stack_limits - adjust queue limits for stacked drivers
* @disk: MD/DM gendisk (top)
* queue_limits_stack_bdev - adjust queue_limits for stacked devices
* @t: the stacking driver limits (top device)
* @bdev: the underlying block device (bottom)
* @offset: offset to beginning of data within component device
* @pfx: prefix to use for warnings logged
*
* Description:
* Merges the limits for a top level gendisk and a bottom level
* block_device.
* This function is used by stacking drivers like MD and DM to ensure
* that all component devices have compatible block sizes and
* alignments. The stacking driver must provide a queue_limits
* struct (top) and then iteratively call the stacking function for
* all component (bottom) devices. The stacking function will
* attempt to combine the values and ensure proper alignment.
*/
void disk_stack_limits(struct gendisk *disk, struct block_device *bdev,
sector_t offset)
void queue_limits_stack_bdev(struct queue_limits *t, struct block_device *bdev,
sector_t offset, const char *pfx)
{
struct request_queue *t = disk->queue;
if (blk_stack_limits(&t->limits, &bdev_get_queue(bdev)->limits,
get_start_sect(bdev) + (offset >> 9)) < 0)
if (blk_stack_limits(t, &bdev_get_queue(bdev)->limits,
get_start_sect(bdev) + offset))
pr_notice("%s: Warning: Device %pg is misaligned\n",
disk->disk_name, bdev);
disk_update_readahead(disk);
pfx, bdev);
}
EXPORT_SYMBOL(disk_stack_limits);
EXPORT_SYMBOL_GPL(queue_limits_stack_bdev);
/**
* blk_queue_update_dma_pad - update pad mask

View File

@ -27,7 +27,7 @@ void blk_rq_stat_init(struct blk_rq_stat *stat)
/* src is a per-cpu stat, mean isn't initialized */
void blk_rq_stat_sum(struct blk_rq_stat *dst, struct blk_rq_stat *src)
{
if (!src->nr_samples)
if (dst->nr_samples + src->nr_samples <= dst->nr_samples)
return;
dst->min = min(dst->min, src->min);

View File

@ -174,23 +174,29 @@ static ssize_t queue_discard_max_show(struct request_queue *q, char *page)
static ssize_t queue_discard_max_store(struct request_queue *q,
const char *page, size_t count)
{
unsigned long max_discard;
ssize_t ret = queue_var_store(&max_discard, page, count);
unsigned long max_discard_bytes;
struct queue_limits lim;
ssize_t ret;
int err;
ret = queue_var_store(&max_discard_bytes, page, count);
if (ret < 0)
return ret;
if (max_discard & (q->limits.discard_granularity - 1))
if (max_discard_bytes & (q->limits.discard_granularity - 1))
return -EINVAL;
max_discard >>= 9;
if (max_discard > UINT_MAX)
if ((max_discard_bytes >> SECTOR_SHIFT) > UINT_MAX)
return -EINVAL;
if (max_discard > q->limits.max_hw_discard_sectors)
max_discard = q->limits.max_hw_discard_sectors;
blk_mq_freeze_queue(q);
lim = queue_limits_start_update(q);
lim.max_user_discard_sectors = max_discard_bytes >> SECTOR_SHIFT;
err = queue_limits_commit_update(q, &lim);
blk_mq_unfreeze_queue(q);
q->limits.max_discard_sectors = max_discard;
if (err)
return err;
return ret;
}
@ -226,35 +232,22 @@ static ssize_t queue_zone_append_max_show(struct request_queue *q, char *page)
static ssize_t
queue_max_sectors_store(struct request_queue *q, const char *page, size_t count)
{
unsigned long var;
unsigned int max_sectors_kb,
max_hw_sectors_kb = queue_max_hw_sectors(q) >> 1,
page_kb = 1 << (PAGE_SHIFT - 10);
ssize_t ret = queue_var_store(&var, page, count);
unsigned long max_sectors_kb;
struct queue_limits lim;
ssize_t ret;
int err;
ret = queue_var_store(&max_sectors_kb, page, count);
if (ret < 0)
return ret;
max_sectors_kb = (unsigned int)var;
max_hw_sectors_kb = min_not_zero(max_hw_sectors_kb,
q->limits.max_dev_sectors >> 1);
if (max_sectors_kb == 0) {
q->limits.max_user_sectors = 0;
max_sectors_kb = min(max_hw_sectors_kb,
BLK_DEF_MAX_SECTORS_CAP >> 1);
} else {
if (max_sectors_kb > max_hw_sectors_kb ||
max_sectors_kb < page_kb)
return -EINVAL;
q->limits.max_user_sectors = max_sectors_kb << 1;
}
spin_lock_irq(&q->queue_lock);
q->limits.max_sectors = max_sectors_kb << 1;
if (q->disk)
q->disk->bdi->io_pages = max_sectors_kb >> (PAGE_SHIFT - 10);
spin_unlock_irq(&q->queue_lock);
blk_mq_freeze_queue(q);
lim = queue_limits_start_update(q);
lim.max_user_sectors = max_sectors_kb << 1;
err = queue_limits_commit_update(q, &lim);
blk_mq_unfreeze_queue(q);
if (err)
return err;
return ret;
}

View File

@ -1098,7 +1098,7 @@ static int throtl_dispatch_tg(struct throtl_grp *tg)
while ((bio = throtl_peek_queued(&sq->queued[READ])) &&
tg_may_dispatch(tg, bio, NULL)) {
tg_dispatch_one_bio(tg, bio_data_dir(bio));
tg_dispatch_one_bio(tg, READ);
nr_reads++;
if (nr_reads >= max_nr_reads)
@ -1108,7 +1108,7 @@ static int throtl_dispatch_tg(struct throtl_grp *tg)
while ((bio = throtl_peek_queued(&sq->queued[WRITE])) &&
tg_may_dispatch(tg, bio, NULL)) {
tg_dispatch_one_bio(tg, bio_data_dir(bio));
tg_dispatch_one_bio(tg, WRITE);
nr_writes++;
if (nr_writes >= max_nr_writes)
@ -1815,7 +1815,7 @@ static bool throtl_tg_is_idle(struct throtl_grp *tg)
time = min_t(unsigned long, MAX_IDLE_TIME, 4 * tg->idletime_threshold);
ret = tg->latency_target == DFL_LATENCY_TARGET ||
tg->idletime_threshold == DFL_IDLE_THRESHOLD ||
(ktime_get_ns() >> 10) - tg->last_finish_time > time ||
(blk_time_get_ns() >> 10) - tg->last_finish_time > time ||
tg->avg_idletime > tg->idletime_threshold ||
(tg->latency_target && tg->bio_cnt &&
tg->bad_bio_cnt * 5 < tg->bio_cnt);
@ -2060,7 +2060,7 @@ static void blk_throtl_update_idletime(struct throtl_grp *tg)
if (last_finish_time == 0)
return;
now = ktime_get_ns() >> 10;
now = blk_time_get_ns() >> 10;
if (now <= last_finish_time ||
last_finish_time == tg->checked_last_finish_time)
return;
@ -2327,7 +2327,7 @@ void blk_throtl_bio_endio(struct bio *bio)
if (!tg->td->limit_valid[LIMIT_LOW])
return;
finish_time_ns = ktime_get_ns();
finish_time_ns = blk_time_get_ns();
tg->last_finish_time = finish_time_ns >> 10;
start_time = bio_issue_time(&bio->bi_issue) >> 10;

View File

@ -29,6 +29,7 @@
#include "blk-wbt.h"
#include "blk-rq-qos.h"
#include "elevator.h"
#include "blk.h"
#define CREATE_TRACE_POINTS
#include <trace/events/wbt.h>
@ -274,13 +275,12 @@ static inline bool stat_sample_valid(struct blk_rq_stat *stat)
static u64 rwb_sync_issue_lat(struct rq_wb *rwb)
{
u64 now, issue = READ_ONCE(rwb->sync_issue);
u64 issue = READ_ONCE(rwb->sync_issue);
if (!issue || !rwb->sync_cookie)
return 0;
now = ktime_to_ns(ktime_get());
return now - issue;
return blk_time_get_ns() - issue;
}
static inline unsigned int wbt_inflight(struct rq_wb *rwb)

View File

@ -11,7 +11,6 @@
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/rbtree.h>
#include <linux/blkdev.h>
#include <linux/blk-mq.h>
#include <linux/mm.h>
@ -177,8 +176,7 @@ static int blk_zone_need_reset_cb(struct blk_zone *zone, unsigned int idx,
}
}
static int blkdev_zone_reset_all_emulated(struct block_device *bdev,
gfp_t gfp_mask)
static int blkdev_zone_reset_all_emulated(struct block_device *bdev)
{
struct gendisk *disk = bdev->bd_disk;
sector_t capacity = bdev_nr_sectors(bdev);
@ -205,7 +203,7 @@ static int blkdev_zone_reset_all_emulated(struct block_device *bdev,
}
bio = blk_next_bio(bio, bdev, 0, REQ_OP_ZONE_RESET | REQ_SYNC,
gfp_mask);
GFP_KERNEL);
bio->bi_iter.bi_sector = sector;
sector += zone_sectors;
@ -223,7 +221,7 @@ out_free_need_reset:
return ret;
}
static int blkdev_zone_reset_all(struct block_device *bdev, gfp_t gfp_mask)
static int blkdev_zone_reset_all(struct block_device *bdev)
{
struct bio bio;
@ -238,7 +236,6 @@ static int blkdev_zone_reset_all(struct block_device *bdev, gfp_t gfp_mask)
* @sector: Start sector of the first zone to operate on
* @nr_sectors: Number of sectors, should be at least the length of one zone and
* must be zone size aligned.
* @gfp_mask: Memory allocation flags (for bio_alloc)
*
* Description:
* Perform the specified operation on the range of zones specified by
@ -248,7 +245,7 @@ static int blkdev_zone_reset_all(struct block_device *bdev, gfp_t gfp_mask)
* or finish request.
*/
int blkdev_zone_mgmt(struct block_device *bdev, enum req_op op,
sector_t sector, sector_t nr_sectors, gfp_t gfp_mask)
sector_t sector, sector_t nr_sectors)
{
struct request_queue *q = bdev_get_queue(bdev);
sector_t zone_sectors = bdev_zone_sectors(bdev);
@ -285,12 +282,12 @@ int blkdev_zone_mgmt(struct block_device *bdev, enum req_op op,
*/
if (op == REQ_OP_ZONE_RESET && sector == 0 && nr_sectors == capacity) {
if (!blk_queue_zone_resetall(q))
return blkdev_zone_reset_all_emulated(bdev, gfp_mask);
return blkdev_zone_reset_all(bdev, gfp_mask);
return blkdev_zone_reset_all_emulated(bdev);
return blkdev_zone_reset_all(bdev);
}
while (sector < end_sector) {
bio = blk_next_bio(bio, bdev, 0, op | REQ_SYNC, gfp_mask);
bio = blk_next_bio(bio, bdev, 0, op | REQ_SYNC, GFP_KERNEL);
bio->bi_iter.bi_sector = sector;
sector += zone_sectors;
@ -419,8 +416,7 @@ int blkdev_zone_mgmt_ioctl(struct block_device *bdev, blk_mode_t mode,
return -ENOTTY;
}
ret = blkdev_zone_mgmt(bdev, op, zrange.sector, zrange.nr_sectors,
GFP_KERNEL);
ret = blkdev_zone_mgmt(bdev, op, zrange.sector, zrange.nr_sectors);
fail:
if (cmd == BLKRESETZONE)

View File

@ -4,6 +4,8 @@
#include <linux/blk-crypto.h>
#include <linux/memblock.h> /* for max_pfn/max_low_pfn */
#include <linux/sched/sysctl.h>
#include <linux/timekeeping.h>
#include <xen/xen.h>
#include "blk-crypto-internal.h"
@ -70,6 +72,18 @@ static inline int bio_queue_enter(struct bio *bio)
return __bio_queue_enter(q, bio);
}
static inline void blk_wait_io(struct completion *done)
{
	/*
	 * Wait in slices of half the hung task timeout so the hang_check
	 * timer does not fire at us during very long I/O.
	 */
	unsigned long timeout = sysctl_hung_task_timeout_secs * HZ / 2;

	/* A zero timeout leaves nothing to slice by: wait without one. */
	if (!timeout) {
		wait_for_completion_io(done);
		return;
	}

	/* Re-arm the timed wait until the completion is signalled. */
	for (;;) {
		if (wait_for_completion_io_timeout(done, timeout))
			break;
	}
}
#define BIO_INLINE_VECS 4
struct bio_vec *bvec_alloc(mempool_t *pool, unsigned short *nr_vecs,
gfp_t gfp_mask);
@ -329,7 +343,7 @@ void blk_rq_set_mixed_merge(struct request *rq);
bool blk_rq_merge_ok(struct request *rq, struct bio *bio);
enum elv_merge blk_try_merge(struct request *rq, struct bio *bio);
void blk_set_default_limits(struct queue_limits *lim);
int blk_set_default_limits(struct queue_limits *lim);
int blk_dev_init(void);
/*
@ -447,7 +461,7 @@ static inline void bio_release_page(struct bio *bio, struct page *page)
unpin_user_page(page);
}
struct request_queue *blk_alloc_queue(int node_id);
struct request_queue *blk_alloc_queue(struct queue_limits *lim, int node_id);
int disk_scan_partitions(struct gendisk *disk, blk_mode_t mode);
@ -516,8 +530,75 @@ static inline int req_ref_read(struct request *req)
return atomic_read(&req->ref);
}
static inline u64 blk_time_get_ns(void)
{
	struct blk_plug *plug = current->plug;

	if (plug) {
		/*
		 * A cached value of 0 is treated as "nothing cached". 0 could
		 * be a genuine timestamp, but instead of tracking validity
		 * separately we accept one extra ktime_get_ns() call in the
		 * rare case where the current time really is 0.
		 */
		if (!plug->cur_ktime) {
			plug->cur_ktime = ktime_get_ns();
			current->flags |= PF_BLOCK_TS;
		}
		return plug->cur_ktime;
	}

	/* Without a plug there is nowhere to cache: read the clock. */
	return ktime_get_ns();
}
static inline ktime_t blk_time_get(void)
{
return ns_to_ktime(blk_time_get_ns());
}
/*
* Layout of the 64-bit bio issue value.
*
* From most significant bit:
* 1 bit: reserved for other usage, see below
* 12 bits: original size of bio
* 51 bits: issue time of bio
*/
#define BIO_ISSUE_RES_BITS 1
#define BIO_ISSUE_SIZE_BITS 12
/* Bit position of the reserved field: 64 - 1 = 63 */
#define BIO_ISSUE_RES_SHIFT (64 - BIO_ISSUE_RES_BITS)
/* Bit position of the size field: 63 - 12 = 51 */
#define BIO_ISSUE_SIZE_SHIFT (BIO_ISSUE_RES_SHIFT - BIO_ISSUE_SIZE_BITS)
/* Selects the issue time: bits 0-50 */
#define BIO_ISSUE_TIME_MASK ((1ULL << BIO_ISSUE_SIZE_SHIFT) - 1)
/* Selects the size field: bits 51-62 */
#define BIO_ISSUE_SIZE_MASK \
(((1ULL << BIO_ISSUE_SIZE_BITS) - 1) << BIO_ISSUE_SIZE_SHIFT)
/* Selects the reserved field: bit 63 */
#define BIO_ISSUE_RES_MASK (~((1ULL << BIO_ISSUE_RES_SHIFT) - 1))
/* Reserved bit for blk-throtl */
#define BIO_ISSUE_THROTL_SKIP_LATENCY (1ULL << 63)
/* Keep only the bits of @time covered by BIO_ISSUE_TIME_MASK. */
static inline u64 __bio_issue_time(u64 time)
{
	u64 time_bits = time & BIO_ISSUE_TIME_MASK;

	return time_bits;
}
/* Return the issue time field stored in @issue->value. */
static inline u64 bio_issue_time(struct bio_issue *issue)
{
	u64 time_bits = __bio_issue_time(issue->value);

	return time_bits;
}
static inline sector_t bio_issue_size(struct bio_issue *issue)
{
return ((issue->value & BIO_ISSUE_SIZE_MASK) >> BIO_ISSUE_SIZE_SHIFT);
}
/*
 * Rebuild @issue->value from the current time and @size.
 *
 * The bits under BIO_ISSUE_RES_MASK are carried over from the old value,
 * the time field is taken from blk_time_get_ns(), and the size field holds
 * the low BIO_ISSUE_SIZE_BITS bits of @size.
 */
static inline void bio_issue_init(struct bio_issue *issue,
				  sector_t size)
{
	u64 res_bits, time_bits, size_bits;

	/* Anything that does not fit into the size field is dropped. */
	size &= (1ULL << BIO_ISSUE_SIZE_BITS) - 1;

	res_bits = issue->value & BIO_ISSUE_RES_MASK;
	time_bits = blk_time_get_ns() & BIO_ISSUE_TIME_MASK;
	size_bits = (u64)size << BIO_ISSUE_SIZE_SHIFT;

	issue->value = res_bits | time_bits | size_bits;
}
void bdev_release(struct file *bdev_file);
int bdev_open(struct block_device *bdev, blk_mode_t mode, void *holder,
const struct blk_holder_ops *hops, struct file *bdev_file);
int bdev_permission(dev_t dev, blk_mode_t mode, void *holder);
#endif /* BLK_INTERNAL_H */

View File

@ -383,7 +383,7 @@ struct request_queue *bsg_setup_queue(struct device *dev, const char *name,
if (blk_mq_alloc_tag_set(set))
goto out_tag_set;
q = blk_mq_init_queue(set);
q = blk_mq_alloc_queue(set, NULL, NULL);
if (IS_ERR(q)) {
ret = PTR_ERR(q);
goto out_queue;

View File

@ -1201,7 +1201,7 @@ static int block_uevent(const struct device *dev, struct kobj_uevent_env *env)
return add_uevent_var(env, "DISKSEQ=%llu", disk->diskseq);
}
struct class block_class = {
const struct class block_class = {
.name = "block",
.dev_uevent = block_uevent,
};
@ -1391,19 +1391,21 @@ out_free_disk:
return NULL;
}
struct gendisk *__blk_alloc_disk(int node, struct lock_class_key *lkclass)
struct gendisk *__blk_alloc_disk(struct queue_limits *lim, int node,
struct lock_class_key *lkclass)
{
struct queue_limits default_lim = { };
struct request_queue *q;
struct gendisk *disk;
q = blk_alloc_queue(node);
if (!q)
return NULL;
q = blk_alloc_queue(lim ? lim : &default_lim, node);
if (IS_ERR(q))
return ERR_CAST(q);
disk = __alloc_disk_node(q, node, lkclass);
if (!disk) {
blk_put_queue(q);
return NULL;
return ERR_PTR(-ENOMEM);
}
set_bit(GD_OWNS_QUEUE, &disk->state);
return disk;

View File

@ -8,6 +8,8 @@ struct bd_holder_disk {
int refcnt;
};
static DEFINE_MUTEX(blk_holder_mutex);
static struct bd_holder_disk *bd_find_holder_disk(struct block_device *bdev,
struct gendisk *disk)
{
@ -80,7 +82,7 @@ int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk)
kobject_get(bdev->bd_holder_dir);
mutex_unlock(&bdev->bd_disk->open_mutex);
mutex_lock(&disk->open_mutex);
mutex_lock(&blk_holder_mutex);
WARN_ON_ONCE(!bdev->bd_holder);
holder = bd_find_holder_disk(bdev, disk);
@ -108,7 +110,7 @@ int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk)
goto out_del_symlink;
list_add(&holder->list, &disk->slave_bdevs);
mutex_unlock(&disk->open_mutex);
mutex_unlock(&blk_holder_mutex);
return 0;
out_del_symlink:
@ -116,7 +118,7 @@ out_del_symlink:
out_free_holder:
kfree(holder);
out_unlock:
mutex_unlock(&disk->open_mutex);
mutex_unlock(&blk_holder_mutex);
if (ret)
kobject_put(bdev->bd_holder_dir);
return ret;
@ -140,7 +142,7 @@ void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk)
if (WARN_ON_ONCE(!disk->slave_dir))
return;
mutex_lock(&disk->open_mutex);
mutex_lock(&blk_holder_mutex);
holder = bd_find_holder_disk(bdev, disk);
if (!WARN_ON_ONCE(holder == NULL) && !--holder->refcnt) {
del_symlink(disk->slave_dir, bdev_kobj(bdev));
@ -149,6 +151,6 @@ void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk)
list_del_init(&holder->list);
kfree(holder);
}
mutex_unlock(&disk->open_mutex);
mutex_unlock(&blk_holder_mutex);
}
EXPORT_SYMBOL_GPL(bd_unlink_disk_holder);

View File

@ -18,7 +18,7 @@ static int blkpg_do_ioctl(struct block_device *bdev,
{
struct gendisk *disk = bdev->bd_disk;
struct blkpg_partition p;
sector_t start, length;
sector_t start, length, capacity, end;
if (!capable(CAP_SYS_ADMIN))
return -EACCES;
@ -41,6 +41,13 @@ static int blkpg_do_ioctl(struct block_device *bdev,
start = p.start >> SECTOR_SHIFT;
length = p.length >> SECTOR_SHIFT;
capacity = get_capacity(disk);
if (check_add_overflow(start, length, &end))
return -EINVAL;
if (start >= capacity || end > capacity)
return -EINVAL;
switch (op) {
case BLKPG_ADD_PARTITION:

View File

@ -419,21 +419,10 @@ static bool partition_overlaps(struct gendisk *disk, sector_t start,
int bdev_add_partition(struct gendisk *disk, int partno, sector_t start,
sector_t length)
{
sector_t capacity = get_capacity(disk), end;
struct block_device *part;
int ret;
mutex_lock(&disk->open_mutex);
if (check_add_overflow(start, length, &end)) {
ret = -EINVAL;
goto out;
}
if (start >= capacity || end > capacity) {
ret = -EINVAL;
goto out;
}
if (!disk_live(disk)) {
ret = -ENXIO;
goto out;

View File

@ -20,6 +20,7 @@ extern void note_bootable_part(dev_t dev, int part, int goodness);
* Code to understand MacOS partition tables.
*/
#ifdef CONFIG_PPC_PMAC
static inline void mac_fix_string(char *stg, int len)
{
int i;
@ -27,6 +28,7 @@ static inline void mac_fix_string(char *stg, int len)
for (i = len - 1; i >= 0 && stg[i] == ' '; i--)
stg[i] = 0;
}
#endif
int mac_partition(struct parsed_partitions *state)
{

View File

@ -1212,7 +1212,7 @@ static int cmd_start(struct opal_dev *dev, const u8 *uid, const u8 *method)
static int start_opal_session_cont(struct opal_dev *dev)
{
u32 hsn, tsn;
int error = 0;
int error;
error = parse_and_check_status(dev);
if (error)
@ -1354,7 +1354,7 @@ static int get_active_key_cont(struct opal_dev *dev)
{
const char *activekey;
size_t keylen;
int error = 0;
int error;
error = parse_and_check_status(dev);
if (error)
@ -2157,7 +2157,7 @@ static int lock_unlock_locking_range(struct opal_dev *dev, void *data)
u8 lr_buffer[OPAL_UID_LENGTH];
struct opal_lock_unlock *lkul = data;
u8 read_locked = 1, write_locked = 1;
int err = 0;
int err;
if (build_locking_range(lr_buffer, sizeof(lr_buffer),
lkul->session.opal_key.lr) < 0)
@ -2580,7 +2580,7 @@ static int opal_get_discv(struct opal_dev *dev, struct opal_discovery *discv)
const struct opal_step discovery0_step = {
opal_discovery0, discv
};
int ret = 0;
int ret;
mutex_lock(&dev->dev_lock);
setup_opal_dev(dev);
@ -3069,7 +3069,7 @@ bool opal_unlock_from_suspend(struct opal_dev *dev)
{
struct opal_suspend_data *suspend;
bool was_failure = false;
int ret = 0;
int ret;
if (!dev)
return false;
@ -3112,10 +3112,9 @@ static int opal_read_table(struct opal_dev *dev,
{ read_table_data, rw_tbl },
{ end_opal_session, }
};
int ret = 0;
if (!rw_tbl->size)
return ret;
return 0;
return execute_steps(dev, read_table_steps,
ARRAY_SIZE(read_table_steps));
@ -3129,10 +3128,9 @@ static int opal_write_table(struct opal_dev *dev,
{ write_table_data, rw_tbl },
{ end_opal_session, }
};
int ret = 0;
if (!rw_tbl->size)
return ret;
return 0;
return execute_steps(dev, write_table_steps,
ARRAY_SIZE(write_table_steps));

View File

@ -12,14 +12,14 @@
#include <net/checksum.h>
#include <asm/unaligned.h>
typedef __be16 (csum_fn) (void *, unsigned int);
typedef __be16 (csum_fn) (__be16, void *, unsigned int);
static __be16 t10_pi_crc_fn(void *data, unsigned int len)
static __be16 t10_pi_crc_fn(__be16 crc, void *data, unsigned int len)
{
return cpu_to_be16(crc_t10dif(data, len));
return cpu_to_be16(crc_t10dif_update(be16_to_cpu(crc), data, len));
}
static __be16 t10_pi_ip_fn(void *data, unsigned int len)
static __be16 t10_pi_ip_fn(__be16 csum, void *data, unsigned int len)
{
return (__force __be16)ip_compute_csum(data, len);
}
@ -32,12 +32,16 @@ static __be16 t10_pi_ip_fn(void *data, unsigned int len)
static blk_status_t t10_pi_generate(struct blk_integrity_iter *iter,
csum_fn *fn, enum t10_dif_type type)
{
u8 offset = iter->pi_offset;
unsigned int i;
for (i = 0 ; i < iter->data_size ; i += iter->interval) {
struct t10_pi_tuple *pi = iter->prot_buf;
struct t10_pi_tuple *pi = iter->prot_buf + offset;
pi->guard_tag = fn(iter->data_buf, iter->interval);
pi->guard_tag = fn(0, iter->data_buf, iter->interval);
if (offset)
pi->guard_tag = fn(pi->guard_tag, iter->prot_buf,
offset);
pi->app_tag = 0;
if (type == T10_PI_TYPE1_PROTECTION)
@ -56,12 +60,13 @@ static blk_status_t t10_pi_generate(struct blk_integrity_iter *iter,
static blk_status_t t10_pi_verify(struct blk_integrity_iter *iter,
csum_fn *fn, enum t10_dif_type type)
{
u8 offset = iter->pi_offset;
unsigned int i;
BUG_ON(type == T10_PI_TYPE0_PROTECTION);
for (i = 0 ; i < iter->data_size ; i += iter->interval) {
struct t10_pi_tuple *pi = iter->prot_buf;
struct t10_pi_tuple *pi = iter->prot_buf + offset;
__be16 csum;
if (type == T10_PI_TYPE1_PROTECTION ||
@ -83,7 +88,9 @@ static blk_status_t t10_pi_verify(struct blk_integrity_iter *iter,
goto next;
}
csum = fn(iter->data_buf, iter->interval);
csum = fn(0, iter->data_buf, iter->interval);
if (offset)
csum = fn(csum, iter->prot_buf, offset);
if (pi->guard_tag != csum) {
pr_err("%s: guard tag error at sector %llu " \
@ -134,8 +141,10 @@ static blk_status_t t10_pi_type1_verify_ip(struct blk_integrity_iter *iter)
*/
static void t10_pi_type1_prepare(struct request *rq)
{
const int tuple_sz = rq->q->integrity.tuple_size;
struct blk_integrity *bi = &rq->q->integrity;
const int tuple_sz = bi->tuple_size;
u32 ref_tag = t10_pi_ref_tag(rq);
u8 offset = bi->pi_offset;
struct bio *bio;
__rq_for_each_bio(bio, rq) {
@ -154,7 +163,7 @@ static void t10_pi_type1_prepare(struct request *rq)
p = bvec_kmap_local(&iv);
for (j = 0; j < iv.bv_len; j += tuple_sz) {
struct t10_pi_tuple *pi = p;
struct t10_pi_tuple *pi = p + offset;
if (be32_to_cpu(pi->ref_tag) == virt)
pi->ref_tag = cpu_to_be32(ref_tag);
@ -183,9 +192,11 @@ static void t10_pi_type1_prepare(struct request *rq)
*/
static void t10_pi_type1_complete(struct request *rq, unsigned int nr_bytes)
{
unsigned intervals = nr_bytes >> rq->q->integrity.interval_exp;
const int tuple_sz = rq->q->integrity.tuple_size;
struct blk_integrity *bi = &rq->q->integrity;
unsigned intervals = nr_bytes >> bi->interval_exp;
const int tuple_sz = bi->tuple_size;
u32 ref_tag = t10_pi_ref_tag(rq);
u8 offset = bi->pi_offset;
struct bio *bio;
__rq_for_each_bio(bio, rq) {
@ -200,7 +211,7 @@ static void t10_pi_type1_complete(struct request *rq, unsigned int nr_bytes)
p = bvec_kmap_local(&iv);
for (j = 0; j < iv.bv_len && intervals; j += tuple_sz) {
struct t10_pi_tuple *pi = p;
struct t10_pi_tuple *pi = p + offset;
if (be32_to_cpu(pi->ref_tag) == ref_tag)
pi->ref_tag = cpu_to_be32(virt);
@ -280,20 +291,24 @@ const struct blk_integrity_profile t10_pi_type3_ip = {
};
EXPORT_SYMBOL(t10_pi_type3_ip);
static __be64 ext_pi_crc64(void *data, unsigned int len)
static __be64 ext_pi_crc64(u64 crc, void *data, unsigned int len)
{
return cpu_to_be64(crc64_rocksoft(data, len));
return cpu_to_be64(crc64_rocksoft_update(crc, data, len));
}
static blk_status_t ext_pi_crc64_generate(struct blk_integrity_iter *iter,
enum t10_dif_type type)
{
u8 offset = iter->pi_offset;
unsigned int i;
for (i = 0 ; i < iter->data_size ; i += iter->interval) {
struct crc64_pi_tuple *pi = iter->prot_buf;
struct crc64_pi_tuple *pi = iter->prot_buf + offset;
pi->guard_tag = ext_pi_crc64(iter->data_buf, iter->interval);
pi->guard_tag = ext_pi_crc64(0, iter->data_buf, iter->interval);
if (offset)
pi->guard_tag = ext_pi_crc64(be64_to_cpu(pi->guard_tag),
iter->prot_buf, offset);
pi->app_tag = 0;
if (type == T10_PI_TYPE1_PROTECTION)
@ -319,10 +334,11 @@ static bool ext_pi_ref_escape(u8 *ref_tag)
static blk_status_t ext_pi_crc64_verify(struct blk_integrity_iter *iter,
enum t10_dif_type type)
{
u8 offset = iter->pi_offset;
unsigned int i;
for (i = 0; i < iter->data_size; i += iter->interval) {
struct crc64_pi_tuple *pi = iter->prot_buf;
struct crc64_pi_tuple *pi = iter->prot_buf + offset;
u64 ref, seed;
__be64 csum;
@ -343,7 +359,11 @@ static blk_status_t ext_pi_crc64_verify(struct blk_integrity_iter *iter,
goto next;
}
csum = ext_pi_crc64(iter->data_buf, iter->interval);
csum = ext_pi_crc64(0, iter->data_buf, iter->interval);
if (offset)
csum = ext_pi_crc64(be64_to_cpu(csum), iter->prot_buf,
offset);
if (pi->guard_tag != csum) {
pr_err("%s: guard tag error at sector %llu " \
"(rcvd %016llx, want %016llx)\n",
@ -373,8 +393,10 @@ static blk_status_t ext_pi_type1_generate_crc64(struct blk_integrity_iter *iter)
static void ext_pi_type1_prepare(struct request *rq)
{
const int tuple_sz = rq->q->integrity.tuple_size;
struct blk_integrity *bi = &rq->q->integrity;
const int tuple_sz = bi->tuple_size;
u64 ref_tag = ext_pi_ref_tag(rq);
u8 offset = bi->pi_offset;
struct bio *bio;
__rq_for_each_bio(bio, rq) {
@ -393,7 +415,7 @@ static void ext_pi_type1_prepare(struct request *rq)
p = bvec_kmap_local(&iv);
for (j = 0; j < iv.bv_len; j += tuple_sz) {
struct crc64_pi_tuple *pi = p;
struct crc64_pi_tuple *pi = p + offset;
u64 ref = get_unaligned_be48(pi->ref_tag);
if (ref == virt)
@ -411,9 +433,11 @@ static void ext_pi_type1_prepare(struct request *rq)
static void ext_pi_type1_complete(struct request *rq, unsigned int nr_bytes)
{
unsigned intervals = nr_bytes >> rq->q->integrity.interval_exp;
const int tuple_sz = rq->q->integrity.tuple_size;
struct blk_integrity *bi = &rq->q->integrity;
unsigned intervals = nr_bytes >> bi->interval_exp;
const int tuple_sz = bi->tuple_size;
u64 ref_tag = ext_pi_ref_tag(rq);
u8 offset = bi->pi_offset;
struct bio *bio;
__rq_for_each_bio(bio, rq) {
@ -428,7 +452,7 @@ static void ext_pi_type1_complete(struct request *rq, unsigned int nr_bytes)
p = bvec_kmap_local(&iv);
for (j = 0; j < iv.bv_len && intervals; j += tuple_sz) {
struct crc64_pi_tuple *pi = p;
struct crc64_pi_tuple *pi = p + offset;
u64 ref = get_unaligned_be48(pi->ref_tag);
if (ref == ref_tag)

View File

@ -207,7 +207,7 @@ static inline int devtmpfs_init(void) { return 0; }
#endif
#ifdef CONFIG_BLOCK
extern struct class block_class;
extern const struct class block_class;
static inline bool is_blockdev(struct device *dev)
{
return dev->class == &block_class;

View File

@ -1779,7 +1779,7 @@ static int fd_alloc_disk(int drive, int system)
struct gendisk *disk;
int err;
disk = blk_mq_alloc_disk(&unit[drive].tag_set, NULL);
disk = blk_mq_alloc_disk(&unit[drive].tag_set, NULL, NULL);
if (IS_ERR(disk))
return PTR_ERR(disk);

View File

@ -24,8 +24,8 @@ static DEFINE_MUTEX(aoeblk_mutex);
static struct kmem_cache *buf_pool_cache;
static struct dentry *aoe_debugfs_dir;
/* GPFS needs a larger value than the default. */
static int aoe_maxsectors;
/* random default picked from the historic block max_sectors cap */
static int aoe_maxsectors = 2560;
module_param(aoe_maxsectors, int, 0644);
MODULE_PARM_DESC(aoe_maxsectors,
"When nonzero, set the maximum number of sectors per I/O request");
@ -334,6 +334,10 @@ aoeblk_gdalloc(void *vp)
mempool_t *mp;
struct blk_mq_tag_set *set;
sector_t ssize;
struct queue_limits lim = {
.max_hw_sectors = aoe_maxsectors,
.io_opt = SZ_2M,
};
ulong flags;
int late = 0;
int err;
@ -371,7 +375,7 @@ aoeblk_gdalloc(void *vp)
goto err_mempool;
}
gd = blk_mq_alloc_disk(set, d);
gd = blk_mq_alloc_disk(set, &lim, d);
if (IS_ERR(gd)) {
pr_err("aoe: cannot allocate block queue for %ld.%d\n",
d->aoemajor, d->aoeminor);
@ -384,14 +388,9 @@ aoeblk_gdalloc(void *vp)
WARN_ON(d->flags & DEVFL_TKILL);
WARN_ON(d->gd);
WARN_ON(d->flags & DEVFL_UP);
/* random number picked from the history block max_sectors cap */
blk_queue_max_hw_sectors(gd->queue, 2560u);
blk_queue_io_opt(gd->queue, SZ_2M);
d->bufpool = mp;
d->blkq = gd->queue;
d->gd = gd;
if (aoe_maxsectors)
blk_queue_max_hw_sectors(gd->queue, aoe_maxsectors);
gd->major = AOE_MAJOR;
gd->first_minor = d->sysminor;
gd->minors = AOE_PARTITIONS;

View File

@ -419,13 +419,16 @@ aoecmd_cfg_pkts(ushort aoemajor, unsigned char aoeminor, struct sk_buff_head *qu
rcu_read_lock();
for_each_netdev_rcu(&init_net, ifp) {
dev_hold(ifp);
if (!is_aoe_netif(ifp))
goto cont;
if (!is_aoe_netif(ifp)) {
dev_put(ifp);
continue;
}
skb = new_skb(sizeof *h + sizeof *ch);
if (skb == NULL) {
printk(KERN_INFO "aoe: skb alloc failure\n");
goto cont;
dev_put(ifp);
continue;
}
skb_put(skb, sizeof *h + sizeof *ch);
skb->dev = ifp;
@ -440,9 +443,6 @@ aoecmd_cfg_pkts(ushort aoemajor, unsigned char aoeminor, struct sk_buff_head *qu
h->major = cpu_to_be16(aoemajor);
h->minor = aoeminor;
h->cmd = AOECMD_CFG;
cont:
dev_put(ifp);
}
rcu_read_unlock();
}

View File

@ -63,6 +63,7 @@ tx(int id) __must_hold(&txlock)
pr_warn("aoe: packet could not be sent on %s. %s\n",
ifp ? ifp->name : "netif",
"consider increasing tx_queue_len");
dev_put(ifp);
spin_lock_irq(&txlock);
}
return 0;

View File

@ -1994,7 +1994,7 @@ static int ataflop_alloc_disk(unsigned int drive, unsigned int type)
{
struct gendisk *disk;
disk = blk_mq_alloc_disk(&unit[drive].tag_set, NULL);
disk = blk_mq_alloc_disk(&unit[drive].tag_set, NULL, NULL);
if (IS_ERR(disk))
return PTR_ERR(disk);

View File

@ -318,6 +318,16 @@ static int brd_alloc(int i)
struct gendisk *disk;
char buf[DISK_NAME_LEN];
int err = -ENOMEM;
struct queue_limits lim = {
/*
* This is so fdisk will align partitions on 4k, because of
* direct_access API needing 4k alignment, returning a PFN
* (This is only a problem on very small devices <= 4M,
* otherwise fdisk will align on 1M. Regardless this call
* is harmless)
*/
.physical_block_size = PAGE_SIZE,
};
list_for_each_entry(brd, &brd_devices, brd_list)
if (brd->brd_number == i)
@ -335,10 +345,11 @@ static int brd_alloc(int i)
debugfs_create_u64(buf, 0444, brd_debugfs_dir,
&brd->brd_nr_pages);
disk = brd->brd_disk = blk_alloc_disk(NUMA_NO_NODE);
if (!disk)
disk = brd->brd_disk = blk_alloc_disk(&lim, NUMA_NO_NODE);
if (IS_ERR(disk)) {
err = PTR_ERR(disk);
goto out_free_dev;
}
disk->major = RAMDISK_MAJOR;
disk->first_minor = i * max_part;
disk->minors = max_part;
@ -347,15 +358,6 @@ static int brd_alloc(int i)
strscpy(disk->disk_name, buf, DISK_NAME_LEN);
set_capacity(disk, rd_size * 2);
/*
* This is so fdisk will align partitions on 4k, because of
* direct_access API needing 4k alignment, returning a PFN
* (This is only a problem on very small devices <= 4M,
* otherwise fdisk will align on 1M. Regardless this call
* is harmless)
*/
blk_queue_physical_block_size(disk->queue, PAGE_SIZE);
/* Tell the block layer that this is not a rotational device */
blk_queue_flag_set(QUEUE_FLAG_NONROT, disk->queue);
blk_queue_flag_set(QUEUE_FLAG_SYNCHRONOUS, disk->queue);

View File

@ -2690,6 +2690,14 @@ enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsig
int id;
int vnr = adm_ctx->volume;
enum drbd_ret_code err = ERR_NOMEM;
struct queue_limits lim = {
/*
* Setting the max_hw_sectors to an odd value of 8kibyte here.
* This triggers a max_bio_size message upon first attach or
* connect.
*/
.max_hw_sectors = DRBD_MAX_BIO_SIZE_SAFE >> 8,
};
device = minor_to_device(minor);
if (device)
@ -2708,9 +2716,11 @@ enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsig
drbd_init_set_defaults(device);
disk = blk_alloc_disk(NUMA_NO_NODE);
if (!disk)
disk = blk_alloc_disk(&lim, NUMA_NO_NODE);
if (IS_ERR(disk)) {
err = PTR_ERR(disk);
goto out_no_disk;
}
device->vdisk = disk;
device->rq_queue = disk->queue;
@ -2727,9 +2737,6 @@ enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsig
blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, disk->queue);
blk_queue_write_cache(disk->queue, true, true);
/* Setting the max_hw_sectors to an odd value of 8kibyte here
This triggers a max_bio_size message upon first attach or connect */
blk_queue_max_hw_sectors(disk->queue, DRBD_MAX_BIO_SIZE_SAFE >> 8);
device->md_io.page = alloc_page(GFP_KERNEL);
if (!device->md_io.page)

View File

@ -1189,9 +1189,31 @@ static int drbd_check_al_size(struct drbd_device *device, struct disk_conf *dc)
return 0;
}
static void blk_queue_discard_granularity(struct request_queue *q, unsigned int granularity)
static unsigned int drbd_max_peer_bio_size(struct drbd_device *device)
{
q->limits.discard_granularity = granularity;
/*
* We may ignore peer limits if the peer is modern enough. From 8.3.8
* onwards the peer can use multiple BIOs for a single peer_request.
*/
if (device->state.conn < C_WF_REPORT_PARAMS)
return device->peer_max_bio_size;
if (first_peer_device(device)->connection->agreed_pro_version < 94)
return min(device->peer_max_bio_size, DRBD_MAX_SIZE_H80_PACKET);
/*
* Correct old drbd (up to 8.3.7) if it believes it can do more than
* 32KiB.
*/
if (first_peer_device(device)->connection->agreed_pro_version == 94)
return DRBD_MAX_SIZE_H80_PACKET;
/*
* drbd 8.3.8 onwards, before 8.4.0
*/
if (first_peer_device(device)->connection->agreed_pro_version < 100)
return DRBD_MAX_BIO_SIZE_P95;
return DRBD_MAX_BIO_SIZE;
}
static unsigned int drbd_max_discard_sectors(struct drbd_connection *connection)
@ -1204,24 +1226,81 @@ static unsigned int drbd_max_discard_sectors(struct drbd_connection *connection)
return AL_EXTENT_SIZE >> 9;
}
static void decide_on_discard_support(struct drbd_device *device,
static bool drbd_discard_supported(struct drbd_connection *connection,
struct drbd_backing_dev *bdev)
{
struct drbd_connection *connection =
first_peer_device(device)->connection;
struct request_queue *q = device->rq_queue;
unsigned int max_discard_sectors;
if (bdev && !bdev_max_discard_sectors(bdev->backing_bdev))
goto not_supported;
return false;
if (connection->cstate >= C_CONNECTED &&
!(connection->agreed_features & DRBD_FF_TRIM)) {
drbd_info(connection,
"peer DRBD too old, does not support TRIM: disabling discards\n");
goto not_supported;
return false;
}
return true;
}
/*
 * This is the workaround for "bio would need to, but cannot, be split".
 *
 * Returns the max_bio_bvecs setting from the disk_conf of device->ldev,
 * read under rcu_read_lock(); a value of zero is replaced by
 * BLK_MAX_SEGMENTS.
 */
static unsigned int drbd_backing_dev_max_segments(struct drbd_device *device)
{
	unsigned int configured;

	rcu_read_lock();
	configured = rcu_dereference(device->ldev->disk_conf)->max_bio_bvecs;
	rcu_read_unlock();

	return configured ? configured : BLK_MAX_SEGMENTS;
}
void drbd_reconsider_queue_parameters(struct drbd_device *device,
struct drbd_backing_dev *bdev, struct o_qlim *o)
{
struct drbd_connection *connection =
first_peer_device(device)->connection;
struct request_queue * const q = device->rq_queue;
unsigned int now = queue_max_hw_sectors(q) << 9;
struct queue_limits lim;
struct request_queue *b = NULL;
unsigned int new;
if (bdev) {
b = bdev->backing_bdev->bd_disk->queue;
device->local_max_bio_size =
queue_max_hw_sectors(b) << SECTOR_SHIFT;
}
/*
* We may later detach and re-attach on a disconnected Primary. Avoid
* decreasing the value in this case.
*
* We want to store what we know the peer DRBD can handle, not what the
* peer IO backend can handle.
*/
new = min3(DRBD_MAX_BIO_SIZE, device->local_max_bio_size,
max(drbd_max_peer_bio_size(device), device->peer_max_bio_size));
if (new != now) {
if (device->state.role == R_PRIMARY && new < now)
drbd_err(device, "ASSERT FAILED new < now; (%u < %u)\n",
new, now);
drbd_info(device, "max BIO size = %u\n", new);
}
lim = queue_limits_start_update(q);
if (bdev) {
blk_set_stacking_limits(&lim);
lim.max_segments = drbd_backing_dev_max_segments(device);
} else {
lim.max_segments = BLK_MAX_SEGMENTS;
}
lim.max_hw_sectors = new >> SECTOR_SHIFT;
lim.seg_boundary_mask = PAGE_SIZE - 1;
/*
* We don't care for the granularity, really.
*
@ -1230,123 +1309,36 @@ static void decide_on_discard_support(struct drbd_device *device,
* problem, really. If you care, you need to use devices with similar
* topology on all peers.
*/
blk_queue_discard_granularity(q, 512);
max_discard_sectors = drbd_max_discard_sectors(connection);
blk_queue_max_discard_sectors(q, max_discard_sectors);
blk_queue_max_write_zeroes_sectors(q, max_discard_sectors);
return;
if (drbd_discard_supported(connection, bdev)) {
lim.discard_granularity = 512;
lim.max_hw_discard_sectors =
drbd_max_discard_sectors(connection);
} else {
lim.discard_granularity = 0;
lim.max_hw_discard_sectors = 0;
}
not_supported:
blk_queue_discard_granularity(q, 0);
blk_queue_max_discard_sectors(q, 0);
}
if (bdev)
blk_stack_limits(&lim, &b->limits, 0);
static void fixup_write_zeroes(struct drbd_device *device, struct request_queue *q)
{
/* Fixup max_write_zeroes_sectors after blk_stack_limits():
* if we can handle "zeroes" efficiently on the protocol,
* we want to do that, even if our backend does not announce
* max_write_zeroes_sectors itself. */
struct drbd_connection *connection = first_peer_device(device)->connection;
/* If the peer announces WZEROES support, use it. Otherwise, rather
* send explicit zeroes than rely on some discard-zeroes-data magic. */
/*
* If we can handle "zeroes" efficiently on the protocol, we want to do
* that, even if our backend does not announce max_write_zeroes_sectors
* itself.
*/
if (connection->agreed_features & DRBD_FF_WZEROES)
q->limits.max_write_zeroes_sectors = DRBD_MAX_BBIO_SECTORS;
lim.max_write_zeroes_sectors = DRBD_MAX_BBIO_SECTORS;
else
q->limits.max_write_zeroes_sectors = 0;
}
lim.max_write_zeroes_sectors = 0;
static void fixup_discard_support(struct drbd_device *device, struct request_queue *q)
{
unsigned int max_discard = device->rq_queue->limits.max_discard_sectors;
unsigned int discard_granularity =
device->rq_queue->limits.discard_granularity >> SECTOR_SHIFT;
if (discard_granularity > max_discard) {
blk_queue_discard_granularity(q, 0);
blk_queue_max_discard_sectors(q, 0);
}
}
static void drbd_setup_queue_param(struct drbd_device *device, struct drbd_backing_dev *bdev,
unsigned int max_bio_size, struct o_qlim *o)
{
struct request_queue * const q = device->rq_queue;
unsigned int max_hw_sectors = max_bio_size >> 9;
unsigned int max_segments = 0;
struct request_queue *b = NULL;
struct disk_conf *dc;
if (bdev) {
b = bdev->backing_bdev->bd_disk->queue;
max_hw_sectors = min(queue_max_hw_sectors(b), max_bio_size >> 9);
rcu_read_lock();
dc = rcu_dereference(device->ldev->disk_conf);
max_segments = dc->max_bio_bvecs;
rcu_read_unlock();
blk_set_stacking_limits(&q->limits);
if ((lim.discard_granularity >> SECTOR_SHIFT) >
lim.max_hw_discard_sectors) {
lim.discard_granularity = 0;
lim.max_hw_discard_sectors = 0;
}
blk_queue_max_hw_sectors(q, max_hw_sectors);
/* This is the workaround for "bio would need to, but cannot, be split" */
blk_queue_max_segments(q, max_segments ? max_segments : BLK_MAX_SEGMENTS);
blk_queue_segment_boundary(q, PAGE_SIZE-1);
decide_on_discard_support(device, bdev);
if (b) {
blk_stack_limits(&q->limits, &b->limits, 0);
disk_update_readahead(device->vdisk);
}
fixup_write_zeroes(device, q);
fixup_discard_support(device, q);
}
void drbd_reconsider_queue_parameters(struct drbd_device *device, struct drbd_backing_dev *bdev, struct o_qlim *o)
{
unsigned int now, new, local, peer;
now = queue_max_hw_sectors(device->rq_queue) << 9;
local = device->local_max_bio_size; /* Eventually last known value, from volatile memory */
peer = device->peer_max_bio_size; /* Eventually last known value, from meta data */
if (bdev) {
local = queue_max_hw_sectors(bdev->backing_bdev->bd_disk->queue) << 9;
device->local_max_bio_size = local;
}
local = min(local, DRBD_MAX_BIO_SIZE);
/* We may ignore peer limits if the peer is modern enough.
Because new from 8.3.8 onwards the peer can use multiple
BIOs for a single peer_request */
if (device->state.conn >= C_WF_REPORT_PARAMS) {
if (first_peer_device(device)->connection->agreed_pro_version < 94)
peer = min(device->peer_max_bio_size, DRBD_MAX_SIZE_H80_PACKET);
/* Correct old drbd (up to 8.3.7) if it believes it can do more than 32KiB */
else if (first_peer_device(device)->connection->agreed_pro_version == 94)
peer = DRBD_MAX_SIZE_H80_PACKET;
else if (first_peer_device(device)->connection->agreed_pro_version < 100)
peer = DRBD_MAX_BIO_SIZE_P95; /* drbd 8.3.8 onwards, before 8.4.0 */
else
peer = DRBD_MAX_BIO_SIZE;
/* We may later detach and re-attach on a disconnected Primary.
* Avoid this setting to jump back in that case.
* We want to store what we know the peer DRBD can handle,
* not what the peer IO backend can handle. */
if (peer > device->peer_max_bio_size)
device->peer_max_bio_size = peer;
}
new = min(local, peer);
if (device->state.role == R_PRIMARY && new < now)
drbd_err(device, "ASSERT FAILED new < now; (%u < %u)\n", new, now);
if (new != now)
drbd_info(device, "max BIO size = %u\n", new);
drbd_setup_queue_param(device, bdev, new, o);
if (queue_limits_commit_update(q, &lim))
drbd_err(device, "setting new queue limits failed\n");
}
/* Starts the worker thread */

View File

@ -1542,9 +1542,10 @@ int drbd_bitmap_io_from_worker(struct drbd_device *device,
int notify_resource_state_change(struct sk_buff *skb,
unsigned int seq,
struct drbd_resource_state_change *resource_state_change,
void *state_change,
enum drbd_notification_type type)
{
struct drbd_resource_state_change *resource_state_change = state_change;
struct drbd_resource *resource = resource_state_change->resource;
struct resource_info resource_info = {
.res_role = resource_state_change->role[NEW],
@ -1558,13 +1559,14 @@ int notify_resource_state_change(struct sk_buff *skb,
int notify_connection_state_change(struct sk_buff *skb,
unsigned int seq,
struct drbd_connection_state_change *connection_state_change,
void *state_change,
enum drbd_notification_type type)
{
struct drbd_connection *connection = connection_state_change->connection;
struct drbd_connection_state_change *p = state_change;
struct drbd_connection *connection = p->connection;
struct connection_info connection_info = {
.conn_connection_state = connection_state_change->cstate[NEW],
.conn_role = connection_state_change->peer_role[NEW],
.conn_connection_state = p->cstate[NEW],
.conn_role = p->peer_role[NEW],
};
return notify_connection_state(skb, seq, connection, &connection_info, type);
@ -1572,9 +1574,10 @@ int notify_connection_state_change(struct sk_buff *skb,
int notify_device_state_change(struct sk_buff *skb,
unsigned int seq,
struct drbd_device_state_change *device_state_change,
void *state_change,
enum drbd_notification_type type)
{
struct drbd_device_state_change *device_state_change = state_change;
struct drbd_device *device = device_state_change->device;
struct device_info device_info = {
.dev_disk_state = device_state_change->disk_state[NEW],
@ -1585,9 +1588,10 @@ int notify_device_state_change(struct sk_buff *skb,
int notify_peer_device_state_change(struct sk_buff *skb,
unsigned int seq,
struct drbd_peer_device_state_change *p,
void *state_change,
enum drbd_notification_type type)
{
struct drbd_peer_device_state_change *p = state_change;
struct drbd_peer_device *peer_device = p->peer_device;
struct peer_device_info peer_device_info = {
.peer_repl_state = p->repl_state[NEW],
@ -1605,8 +1609,8 @@ static void broadcast_state_change(struct drbd_state_change *state_change)
struct drbd_resource_state_change *resource_state_change = &state_change->resource[0];
bool resource_state_has_changed;
unsigned int n_device, n_connection, n_peer_device, n_peer_devices;
int (*last_func)(struct sk_buff *, unsigned int, void *,
enum drbd_notification_type) = NULL;
int (*last_func)(struct sk_buff *, unsigned int,
void *, enum drbd_notification_type) = NULL;
void *last_arg = NULL;
#define HAS_CHANGED(state) ((state)[OLD] != (state)[NEW])
@ -1616,7 +1620,7 @@ static void broadcast_state_change(struct drbd_state_change *state_change)
})
#define REMEMBER_STATE_CHANGE(func, arg, type) \
({ FINAL_STATE_CHANGE(type | NOTIFY_CONTINUES); \
last_func = (typeof(last_func))func; \
last_func = func; \
last_arg = arg; \
})

View File

@ -46,19 +46,19 @@ extern void forget_state_change(struct drbd_state_change *);
extern int notify_resource_state_change(struct sk_buff *,
unsigned int,
struct drbd_resource_state_change *,
void *,
enum drbd_notification_type type);
extern int notify_connection_state_change(struct sk_buff *,
unsigned int,
struct drbd_connection_state_change *,
void *,
enum drbd_notification_type type);
extern int notify_device_state_change(struct sk_buff *,
unsigned int,
struct drbd_device_state_change *,
void *,
enum drbd_notification_type type);
extern int notify_peer_device_state_change(struct sk_buff *,
unsigned int,
struct drbd_peer_device_state_change *,
void *,
enum drbd_notification_type type);
#endif /* DRBD_STATE_CHANGE_H */

View File

@ -530,14 +530,13 @@ static struct format_descr format_req;
static char *floppy_track_buffer;
static int max_buffer_sectors;
typedef void (*done_f)(int);
static const struct cont_t {
void (*interrupt)(void);
/* this is called after the interrupt of the
* main command */
void (*redo)(void); /* this is called to retry the operation */
void (*error)(void); /* this is called to tally an error */
done_f done; /* this is called to say if the operation has
void (*done)(int); /* this is called to say if the operation has
* succeeded/failed */
} *cont;
@ -985,6 +984,10 @@ static void empty(void)
{
}
/*
 * No-op "done" callback: accepts the int result and ignores it. The
 * wakeup_cont and intr_cont tables install it as their .done handler.
 */
static void empty_done(int result)
{
}
static void (*floppy_work_fn)(void);
static void floppy_work_workfn(struct work_struct *work)
@ -1998,14 +2001,14 @@ static const struct cont_t wakeup_cont = {
.interrupt = empty,
.redo = do_wakeup,
.error = empty,
.done = (done_f)empty
.done = empty_done,
};
static const struct cont_t intr_cont = {
.interrupt = empty,
.redo = process_fd_request,
.error = empty,
.done = (done_f)empty
.done = empty_done,
};
/* schedules handler, waiting for completion. May be interrupted, will then
@ -4513,13 +4516,15 @@ static bool floppy_available(int drive)
static int floppy_alloc_disk(unsigned int drive, unsigned int type)
{
struct queue_limits lim = {
.max_hw_sectors = 64,
};
struct gendisk *disk;
disk = blk_mq_alloc_disk(&tag_sets[drive], NULL);
disk = blk_mq_alloc_disk(&tag_sets[drive], &lim, NULL);
if (IS_ERR(disk))
return PTR_ERR(disk);
blk_queue_max_hw_sectors(disk->queue, 64);
disk->major = FLOPPY_MAJOR;
disk->first_minor = TOMINOR(drive) | (type << 2);
disk->minors = 1;

View File

@ -750,12 +750,13 @@ static void loop_sysfs_exit(struct loop_device *lo)
&loop_attribute_group);
}
static void loop_config_discard(struct loop_device *lo)
static void loop_config_discard(struct loop_device *lo,
struct queue_limits *lim)
{
struct file *file = lo->lo_backing_file;
struct inode *inode = file->f_mapping->host;
struct request_queue *q = lo->lo_queue;
u32 granularity, max_discard_sectors;
u32 granularity = 0, max_discard_sectors = 0;
struct kstatfs sbuf;
/*
* If the backing device is a block device, mirror its zeroing
@ -775,29 +776,17 @@ static void loop_config_discard(struct loop_device *lo)
* We use punch hole to reclaim the free space used by the
* image a.k.a. discard.
*/
} else if (!file->f_op->fallocate) {
max_discard_sectors = 0;
granularity = 0;
} else {
struct kstatfs sbuf;
} else if (file->f_op->fallocate && !vfs_statfs(&file->f_path, &sbuf)) {
max_discard_sectors = UINT_MAX >> 9;
if (!vfs_statfs(&file->f_path, &sbuf))
granularity = sbuf.f_bsize;
else
max_discard_sectors = 0;
granularity = sbuf.f_bsize;
}
if (max_discard_sectors) {
q->limits.discard_granularity = granularity;
blk_queue_max_discard_sectors(q, max_discard_sectors);
blk_queue_max_write_zeroes_sectors(q, max_discard_sectors);
} else {
q->limits.discard_granularity = 0;
blk_queue_max_discard_sectors(q, 0);
blk_queue_max_write_zeroes_sectors(q, 0);
}
lim->max_hw_discard_sectors = max_discard_sectors;
lim->max_write_zeroes_sectors = max_discard_sectors;
if (max_discard_sectors)
lim->discard_granularity = granularity;
else
lim->discard_granularity = 0;
}
struct loop_worker {
@ -986,6 +975,20 @@ loop_set_status_from_info(struct loop_device *lo,
return 0;
}
static int loop_reconfigure_limits(struct loop_device *lo, unsigned short bsize,
bool update_discard_settings)
{
struct queue_limits lim;
lim = queue_limits_start_update(lo->lo_queue);
lim.logical_block_size = bsize;
lim.physical_block_size = bsize;
lim.io_min = bsize;
if (update_discard_settings)
loop_config_discard(lo, &lim);
return queue_limits_commit_update(lo->lo_queue, &lim);
}
static int loop_configure(struct loop_device *lo, blk_mode_t mode,
struct block_device *bdev,
const struct loop_config *config)
@ -1083,11 +1086,10 @@ static int loop_configure(struct loop_device *lo, blk_mode_t mode,
else
bsize = 512;
blk_queue_logical_block_size(lo->lo_queue, bsize);
blk_queue_physical_block_size(lo->lo_queue, bsize);
blk_queue_io_min(lo->lo_queue, bsize);
error = loop_reconfigure_limits(lo, bsize, true);
if (WARN_ON_ONCE(error))
goto out_unlock;
loop_config_discard(lo);
loop_update_rotational(lo);
loop_update_dio(lo);
loop_sysfs_init(lo);
@ -1154,9 +1156,7 @@ static void __loop_clr_fd(struct loop_device *lo, bool release)
lo->lo_offset = 0;
lo->lo_sizelimit = 0;
memset(lo->lo_file_name, 0, LO_NAME_SIZE);
blk_queue_logical_block_size(lo->lo_queue, 512);
blk_queue_physical_block_size(lo->lo_queue, 512);
blk_queue_io_min(lo->lo_queue, 512);
loop_reconfigure_limits(lo, 512, false);
invalidate_disk(lo->lo_disk);
loop_sysfs_exit(lo);
/* let user-space know about this change */
@ -1488,9 +1488,7 @@ static int loop_set_block_size(struct loop_device *lo, unsigned long arg)
invalidate_bdev(lo->lo_device);
blk_mq_freeze_queue(lo->lo_queue);
blk_queue_logical_block_size(lo->lo_queue, arg);
blk_queue_physical_block_size(lo->lo_queue, arg);
blk_queue_io_min(lo->lo_queue, arg);
err = loop_reconfigure_limits(lo, arg, false);
loop_update_dio(lo);
blk_mq_unfreeze_queue(lo->lo_queue);
@ -1982,6 +1980,12 @@ static const struct blk_mq_ops loop_mq_ops = {
static int loop_add(int i)
{
struct queue_limits lim = {
/*
* Random number picked from the historic block max_sectors cap.
*/
.max_hw_sectors = 2560u,
};
struct loop_device *lo;
struct gendisk *disk;
int err;
@ -2025,16 +2029,13 @@ static int loop_add(int i)
if (err)
goto out_free_idr;
disk = lo->lo_disk = blk_mq_alloc_disk(&lo->tag_set, lo);
disk = lo->lo_disk = blk_mq_alloc_disk(&lo->tag_set, &lim, lo);
if (IS_ERR(disk)) {
err = PTR_ERR(disk);
goto out_cleanup_tags;
}
lo->lo_queue = lo->lo_disk->queue;
/* random number picked from the history block max_sectors cap */
blk_queue_max_hw_sectors(lo->lo_queue, 2560u);
/*
* By default, we do buffer IO, so it doesn't make sense to enable
* merge because the I/O submitted to backing file is handled page by

View File

@ -3401,6 +3401,12 @@ static const struct blk_mq_ops mtip_mq_ops = {
*/
static int mtip_block_initialize(struct driver_data *dd)
{
struct queue_limits lim = {
.physical_block_size = 4096,
.max_hw_sectors = 0xffff,
.max_segments = MTIP_MAX_SG,
.max_segment_size = 0x400000,
};
int rv = 0, wait_for_rebuild = 0;
sector_t capacity;
unsigned int index = 0;
@ -3431,7 +3437,7 @@ static int mtip_block_initialize(struct driver_data *dd)
goto block_queue_alloc_tag_error;
}
dd->disk = blk_mq_alloc_disk(&dd->tags, dd);
dd->disk = blk_mq_alloc_disk(&dd->tags, &lim, dd);
if (IS_ERR(dd->disk)) {
dev_err(&dd->pdev->dev,
"Unable to allocate request queue\n");
@ -3481,12 +3487,7 @@ skip_create_disk:
/* Set device limits. */
blk_queue_flag_set(QUEUE_FLAG_NONROT, dd->queue);
blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, dd->queue);
blk_queue_max_segments(dd->queue, MTIP_MAX_SG);
blk_queue_physical_block_size(dd->queue, 4096);
blk_queue_max_hw_sectors(dd->queue, 0xffff);
blk_queue_max_segment_size(dd->queue, 0x400000);
dma_set_max_seg_size(&dd->pdev->dev, 0x400000);
blk_queue_io_min(dd->queue, 4096);
/* Set the capacity of the device in 512 byte sectors. */
if (!(mtip_hw_get_capacity(dd, &capacity))) {

View File

@ -114,6 +114,10 @@ static const struct block_device_operations n64cart_fops = {
*/
static int __init n64cart_probe(struct platform_device *pdev)
{
struct queue_limits lim = {
.physical_block_size = 4096,
.logical_block_size = 4096,
};
struct gendisk *disk;
int err = -ENOMEM;
@ -131,9 +135,11 @@ static int __init n64cart_probe(struct platform_device *pdev)
if (IS_ERR(reg_base))
return PTR_ERR(reg_base);
disk = blk_alloc_disk(NUMA_NO_NODE);
if (!disk)
disk = blk_alloc_disk(&lim, NUMA_NO_NODE);
if (IS_ERR(disk)) {
err = PTR_ERR(disk);
goto out;
}
disk->first_minor = 0;
disk->flags = GENHD_FL_NO_PART;
@ -145,8 +151,6 @@ static int __init n64cart_probe(struct platform_device *pdev)
set_disk_ro(disk, 1);
blk_queue_flag_set(QUEUE_FLAG_NONROT, disk->queue);
blk_queue_physical_block_size(disk->queue, 4096);
blk_queue_logical_block_size(disk->queue, 4096);
err = add_disk(disk);
if (err)

View File

@ -316,9 +316,12 @@ static void nbd_mark_nsock_dead(struct nbd_device *nbd, struct nbd_sock *nsock,
nsock->sent = 0;
}
static int nbd_set_size(struct nbd_device *nbd, loff_t bytesize,
static int __nbd_set_size(struct nbd_device *nbd, loff_t bytesize,
loff_t blksize)
{
struct queue_limits lim;
int error;
if (!blksize)
blksize = 1u << NBD_DEF_BLKSIZE_BITS;
@ -334,10 +337,16 @@ static int nbd_set_size(struct nbd_device *nbd, loff_t bytesize,
if (!nbd->pid)
return 0;
lim = queue_limits_start_update(nbd->disk->queue);
if (nbd->config->flags & NBD_FLAG_SEND_TRIM)
blk_queue_max_discard_sectors(nbd->disk->queue, UINT_MAX);
blk_queue_logical_block_size(nbd->disk->queue, blksize);
blk_queue_physical_block_size(nbd->disk->queue, blksize);
lim.max_hw_discard_sectors = UINT_MAX;
else
lim.max_hw_discard_sectors = 0;
lim.logical_block_size = blksize;
lim.physical_block_size = blksize;
error = queue_limits_commit_update(nbd->disk->queue, &lim);
if (error)
return error;
if (max_part)
set_bit(GD_NEED_PART_SCAN, &nbd->disk->state);
@ -346,6 +355,18 @@ static int nbd_set_size(struct nbd_device *nbd, loff_t bytesize,
return 0;
}
static int nbd_set_size(struct nbd_device *nbd, loff_t bytesize,
loff_t blksize)
{
int error;
blk_mq_freeze_queue(nbd->disk->queue);
error = __nbd_set_size(nbd, bytesize, blksize);
blk_mq_unfreeze_queue(nbd->disk->queue);
return error;
}
static void nbd_complete_rq(struct request *req)
{
struct nbd_cmd *cmd = blk_mq_rq_to_pdu(req);
@ -1351,7 +1372,6 @@ static void nbd_config_put(struct nbd_device *nbd)
nbd->config = NULL;
nbd->tag_set.timeout = 0;
blk_queue_max_discard_sectors(nbd->disk->queue, 0);
mutex_unlock(&nbd->config_lock);
nbd_put(nbd);
@ -1783,6 +1803,12 @@ static const struct blk_mq_ops nbd_mq_ops = {
static struct nbd_device *nbd_dev_add(int index, unsigned int refs)
{
struct queue_limits lim = {
.max_hw_sectors = 65536,
.max_user_sectors = 256,
.max_segments = USHRT_MAX,
.max_segment_size = UINT_MAX,
};
struct nbd_device *nbd;
struct gendisk *disk;
int err = -ENOMEM;
@ -1823,7 +1849,7 @@ static struct nbd_device *nbd_dev_add(int index, unsigned int refs)
if (err < 0)
goto out_free_tags;
disk = blk_mq_alloc_disk(&nbd->tag_set, NULL);
disk = blk_mq_alloc_disk(&nbd->tag_set, &lim, NULL);
if (IS_ERR(disk)) {
err = PTR_ERR(disk);
goto out_free_idr;
@ -1843,11 +1869,6 @@ static struct nbd_device *nbd_dev_add(int index, unsigned int refs)
* Tell the block layer that we are not a rotational device
*/
blk_queue_flag_set(QUEUE_FLAG_NONROT, disk->queue);
blk_queue_max_discard_sectors(disk->queue, 0);
blk_queue_max_segment_size(disk->queue, UINT_MAX);
blk_queue_max_segments(disk->queue, USHRT_MAX);
blk_queue_max_hw_sectors(disk->queue, 65536);
disk->queue->limits.max_sectors = 256;
mutex_init(&nbd->config_lock);
refcount_set(&nbd->config_refs, 0);
@ -2433,6 +2454,12 @@ static int nbd_genl_status(struct sk_buff *skb, struct genl_info *info)
}
dev_list = nla_nest_start_noflag(reply, NBD_ATTR_DEVICE_LIST);
if (!dev_list) {
nlmsg_free(reply);
ret = -EMSGSIZE;
goto out;
}
if (index == -1) {
ret = idr_for_each(&nbd_index_idr, &status_cb, reply);
if (ret) {

View File

@ -115,6 +115,18 @@ module_param_string(init_hctx, g_init_hctx_str, sizeof(g_init_hctx_str), 0444);
MODULE_PARM_DESC(init_hctx, "Fault injection to fail hctx init. init_hctx=<interval>,<probability>,<space>,<times>");
#endif
/*
* Historic queue modes.
*
* These days nothing but NULL_Q_MQ is actually supported, but we keep it the
* enum for error reporting.
*/
enum {
NULL_Q_BIO = 0,
NULL_Q_RQ = 1,
NULL_Q_MQ = 2,
};
static int g_queue_mode = NULL_Q_MQ;
static int null_param_store_val(const char *str, int *val, int min, int max)
@ -165,8 +177,8 @@ static bool g_blocking;
module_param_named(blocking, g_blocking, bool, 0444);
MODULE_PARM_DESC(blocking, "Register as a blocking blk-mq driver device");
static bool shared_tags;
module_param(shared_tags, bool, 0444);
static bool g_shared_tags;
module_param_named(shared_tags, g_shared_tags, bool, 0444);
MODULE_PARM_DESC(shared_tags, "Share tag set between devices for blk-mq");
static bool g_shared_tag_bitmap;
@ -426,6 +438,7 @@ NULLB_DEVICE_ATTR(zone_max_open, uint, NULL);
NULLB_DEVICE_ATTR(zone_max_active, uint, NULL);
NULLB_DEVICE_ATTR(virt_boundary, bool, NULL);
NULLB_DEVICE_ATTR(no_sched, bool, NULL);
NULLB_DEVICE_ATTR(shared_tags, bool, NULL);
NULLB_DEVICE_ATTR(shared_tag_bitmap, bool, NULL);
static ssize_t nullb_device_power_show(struct config_item *item, char *page)
@ -571,6 +584,7 @@ static struct configfs_attribute *nullb_device_attrs[] = {
&nullb_device_attr_zone_offline,
&nullb_device_attr_virt_boundary,
&nullb_device_attr_no_sched,
&nullb_device_attr_shared_tags,
&nullb_device_attr_shared_tag_bitmap,
NULL,
};
@ -653,10 +667,11 @@ static ssize_t memb_group_features_show(struct config_item *item, char *page)
"badblocks,blocking,blocksize,cache_size,"
"completion_nsec,discard,home_node,hw_queue_depth,"
"irqmode,max_sectors,mbps,memory_backed,no_sched,"
"poll_queues,power,queue_mode,shared_tag_bitmap,size,"
"submit_queues,use_per_node_hctx,virt_boundary,zoned,"
"zone_capacity,zone_max_active,zone_max_open,"
"zone_nr_conv,zone_offline,zone_readonly,zone_size\n");
"poll_queues,power,queue_mode,shared_tag_bitmap,"
"shared_tags,size,submit_queues,use_per_node_hctx,"
"virt_boundary,zoned,zone_capacity,zone_max_active,"
"zone_max_open,zone_nr_conv,zone_offline,zone_readonly,"
"zone_size\n");
}
CONFIGFS_ATTR_RO(memb_group_, features);
@ -738,6 +753,7 @@ static struct nullb_device *null_alloc_dev(void)
dev->zone_max_active = g_zone_max_active;
dev->virt_boundary = g_virt_boundary;
dev->no_sched = g_no_sched;
dev->shared_tags = g_shared_tags;
dev->shared_tag_bitmap = g_shared_tag_bitmap;
return dev;
}
@ -752,98 +768,11 @@ static void null_free_dev(struct nullb_device *dev)
kfree(dev);
}
static void put_tag(struct nullb_queue *nq, unsigned int tag)
{
clear_bit_unlock(tag, nq->tag_map);
if (waitqueue_active(&nq->wait))
wake_up(&nq->wait);
}
static unsigned int get_tag(struct nullb_queue *nq)
{
unsigned int tag;
do {
tag = find_first_zero_bit(nq->tag_map, nq->queue_depth);
if (tag >= nq->queue_depth)
return -1U;
} while (test_and_set_bit_lock(tag, nq->tag_map));
return tag;
}
static void free_cmd(struct nullb_cmd *cmd)
{
put_tag(cmd->nq, cmd->tag);
}
static enum hrtimer_restart null_cmd_timer_expired(struct hrtimer *timer);
static struct nullb_cmd *__alloc_cmd(struct nullb_queue *nq)
{
struct nullb_cmd *cmd;
unsigned int tag;
tag = get_tag(nq);
if (tag != -1U) {
cmd = &nq->cmds[tag];
cmd->tag = tag;
cmd->error = BLK_STS_OK;
cmd->nq = nq;
if (nq->dev->irqmode == NULL_IRQ_TIMER) {
hrtimer_init(&cmd->timer, CLOCK_MONOTONIC,
HRTIMER_MODE_REL);
cmd->timer.function = null_cmd_timer_expired;
}
return cmd;
}
return NULL;
}
static struct nullb_cmd *alloc_cmd(struct nullb_queue *nq, struct bio *bio)
{
struct nullb_cmd *cmd;
DEFINE_WAIT(wait);
do {
/*
* This avoids multiple return statements, multiple calls to
* __alloc_cmd() and a fast path call to prepare_to_wait().
*/
cmd = __alloc_cmd(nq);
if (cmd) {
cmd->bio = bio;
return cmd;
}
prepare_to_wait(&nq->wait, &wait, TASK_UNINTERRUPTIBLE);
io_schedule();
finish_wait(&nq->wait, &wait);
} while (1);
}
static void end_cmd(struct nullb_cmd *cmd)
{
int queue_mode = cmd->nq->dev->queue_mode;
switch (queue_mode) {
case NULL_Q_MQ:
blk_mq_end_request(cmd->rq, cmd->error);
return;
case NULL_Q_BIO:
cmd->bio->bi_status = cmd->error;
bio_endio(cmd->bio);
break;
}
free_cmd(cmd);
}
static enum hrtimer_restart null_cmd_timer_expired(struct hrtimer *timer)
{
end_cmd(container_of(timer, struct nullb_cmd, timer));
struct nullb_cmd *cmd = container_of(timer, struct nullb_cmd, timer);
blk_mq_end_request(blk_mq_rq_from_pdu(cmd), cmd->error);
return HRTIMER_NORESTART;
}
@ -856,7 +785,9 @@ static void null_cmd_end_timer(struct nullb_cmd *cmd)
static void null_complete_rq(struct request *rq)
{
end_cmd(blk_mq_rq_to_pdu(rq));
struct nullb_cmd *cmd = blk_mq_rq_to_pdu(rq);
blk_mq_end_request(rq, cmd->error);
}
static struct nullb_page *null_alloc_page(void)
@ -1273,7 +1204,7 @@ static int null_transfer(struct nullb *nullb, struct page *page,
static int null_handle_rq(struct nullb_cmd *cmd)
{
struct request *rq = cmd->rq;
struct request *rq = blk_mq_rq_from_pdu(cmd);
struct nullb *nullb = cmd->nq->dev->nullb;
int err;
unsigned int len;
@ -1298,63 +1229,21 @@ static int null_handle_rq(struct nullb_cmd *cmd)
return 0;
}
static int null_handle_bio(struct nullb_cmd *cmd)
{
struct bio *bio = cmd->bio;
struct nullb *nullb = cmd->nq->dev->nullb;
int err;
unsigned int len;
sector_t sector = bio->bi_iter.bi_sector;
struct bio_vec bvec;
struct bvec_iter iter;
spin_lock_irq(&nullb->lock);
bio_for_each_segment(bvec, bio, iter) {
len = bvec.bv_len;
err = null_transfer(nullb, bvec.bv_page, len, bvec.bv_offset,
op_is_write(bio_op(bio)), sector,
bio->bi_opf & REQ_FUA);
if (err) {
spin_unlock_irq(&nullb->lock);
return err;
}
sector += len >> SECTOR_SHIFT;
}
spin_unlock_irq(&nullb->lock);
return 0;
}
static void null_stop_queue(struct nullb *nullb)
{
struct request_queue *q = nullb->q;
if (nullb->dev->queue_mode == NULL_Q_MQ)
blk_mq_stop_hw_queues(q);
}
static void null_restart_queue_async(struct nullb *nullb)
{
struct request_queue *q = nullb->q;
if (nullb->dev->queue_mode == NULL_Q_MQ)
blk_mq_start_stopped_hw_queues(q, true);
}
static inline blk_status_t null_handle_throttled(struct nullb_cmd *cmd)
{
struct nullb_device *dev = cmd->nq->dev;
struct nullb *nullb = dev->nullb;
blk_status_t sts = BLK_STS_OK;
struct request *rq = cmd->rq;
struct request *rq = blk_mq_rq_from_pdu(cmd);
if (!hrtimer_active(&nullb->bw_timer))
hrtimer_restart(&nullb->bw_timer);
if (atomic_long_sub_return(blk_rq_bytes(rq), &nullb->cur_bytes) < 0) {
null_stop_queue(nullb);
blk_mq_stop_hw_queues(nullb->q);
/* race with timer */
if (atomic_long_read(&nullb->cur_bytes) > 0)
null_restart_queue_async(nullb);
blk_mq_start_stopped_hw_queues(nullb->q, true);
/* requeue request */
sts = BLK_STS_DEV_RESOURCE;
}
@ -1381,37 +1270,29 @@ static inline blk_status_t null_handle_memory_backed(struct nullb_cmd *cmd,
sector_t nr_sectors)
{
struct nullb_device *dev = cmd->nq->dev;
int err;
if (op == REQ_OP_DISCARD)
return null_handle_discard(dev, sector, nr_sectors);
return errno_to_blk_status(null_handle_rq(cmd));
if (dev->queue_mode == NULL_Q_BIO)
err = null_handle_bio(cmd);
else
err = null_handle_rq(cmd);
return errno_to_blk_status(err);
}
static void nullb_zero_read_cmd_buffer(struct nullb_cmd *cmd)
{
struct request *rq = blk_mq_rq_from_pdu(cmd);
struct nullb_device *dev = cmd->nq->dev;
struct bio *bio;
if (dev->memory_backed)
return;
if (dev->queue_mode == NULL_Q_BIO && bio_op(cmd->bio) == REQ_OP_READ) {
zero_fill_bio(cmd->bio);
} else if (req_op(cmd->rq) == REQ_OP_READ) {
__rq_for_each_bio(bio, cmd->rq)
if (!dev->memory_backed && req_op(rq) == REQ_OP_READ) {
__rq_for_each_bio(bio, rq)
zero_fill_bio(bio);
}
}
static inline void nullb_complete_cmd(struct nullb_cmd *cmd)
{
struct request *rq = blk_mq_rq_from_pdu(cmd);
/*
* Since root privileges are required to configure the null_blk
* driver, it is fine that this driver does not initialize the
@ -1425,20 +1306,10 @@ static inline void nullb_complete_cmd(struct nullb_cmd *cmd)
/* Complete IO by inline, softirq or timer */
switch (cmd->nq->dev->irqmode) {
case NULL_IRQ_SOFTIRQ:
switch (cmd->nq->dev->queue_mode) {
case NULL_Q_MQ:
blk_mq_complete_request(cmd->rq);
break;
case NULL_Q_BIO:
/*
* XXX: no proper submitting cpu information available.
*/
end_cmd(cmd);
break;
}
blk_mq_complete_request(rq);
break;
case NULL_IRQ_NONE:
end_cmd(cmd);
blk_mq_end_request(rq, cmd->error);
break;
case NULL_IRQ_TIMER:
null_cmd_end_timer(cmd);
@ -1499,7 +1370,7 @@ static enum hrtimer_restart nullb_bwtimer_fn(struct hrtimer *timer)
return HRTIMER_NORESTART;
atomic_long_set(&nullb->cur_bytes, mb_per_tick(mbps));
null_restart_queue_async(nullb);
blk_mq_start_stopped_hw_queues(nullb->q, true);
hrtimer_forward_now(&nullb->bw_timer, timer_interval);
@ -1516,26 +1387,6 @@ static void nullb_setup_bwtimer(struct nullb *nullb)
hrtimer_start(&nullb->bw_timer, timer_interval, HRTIMER_MODE_REL);
}
static struct nullb_queue *nullb_to_queue(struct nullb *nullb)
{
int index = 0;
if (nullb->nr_queues != 1)
index = raw_smp_processor_id() / ((nr_cpu_ids + nullb->nr_queues - 1) / nullb->nr_queues);
return &nullb->queues[index];
}
static void null_submit_bio(struct bio *bio)
{
sector_t sector = bio->bi_iter.bi_sector;
sector_t nr_sectors = bio_sectors(bio);
struct nullb *nullb = bio->bi_bdev->bd_disk->private_data;
struct nullb_queue *nq = nullb_to_queue(nullb);
null_handle_cmd(alloc_cmd(nq, bio), sector, nr_sectors, bio_op(bio));
}
#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION
static bool should_timeout_request(struct request *rq)
@ -1655,7 +1506,7 @@ static int null_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob)
blk_rq_sectors(req));
if (!blk_mq_add_to_batch(req, iob, (__force int) cmd->error,
blk_mq_end_request_batch))
end_cmd(cmd);
blk_mq_end_request(req, cmd->error);
nr++;
}
@ -1711,7 +1562,6 @@ static blk_status_t null_queue_rq(struct blk_mq_hw_ctx *hctx,
hrtimer_init(&cmd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
cmd->timer.function = null_cmd_timer_expired;
}
cmd->rq = rq;
cmd->error = BLK_STS_OK;
cmd->nq = nq;
cmd->fake_timeout = should_timeout_request(rq) ||
@ -1770,34 +1620,8 @@ static void null_queue_rqs(struct request **rqlist)
*rqlist = requeue_list;
}
static void cleanup_queue(struct nullb_queue *nq)
{
bitmap_free(nq->tag_map);
kfree(nq->cmds);
}
static void cleanup_queues(struct nullb *nullb)
{
int i;
for (i = 0; i < nullb->nr_queues; i++)
cleanup_queue(&nullb->queues[i]);
kfree(nullb->queues);
}
static void null_exit_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
{
struct nullb_queue *nq = hctx->driver_data;
struct nullb *nullb = nq->dev->nullb;
nullb->nr_queues--;
}
static void null_init_queue(struct nullb *nullb, struct nullb_queue *nq)
{
init_waitqueue_head(&nq->wait);
nq->queue_depth = nullb->queue_depth;
nq->dev = nullb->dev;
INIT_LIST_HEAD(&nq->poll_list);
spin_lock_init(&nq->poll_lock);
@ -1815,7 +1639,6 @@ static int null_init_hctx(struct blk_mq_hw_ctx *hctx, void *driver_data,
nq = &nullb->queues[hctx_idx];
hctx->driver_data = nq;
null_init_queue(nullb, nq);
nullb->nr_queues++;
return 0;
}
@ -1828,7 +1651,6 @@ static const struct blk_mq_ops null_mq_ops = {
.poll = null_poll,
.map_queues = null_map_queues,
.init_hctx = null_init_hctx,
.exit_hctx = null_exit_hctx,
};
static void null_del_dev(struct nullb *nullb)
@ -1849,21 +1671,20 @@ static void null_del_dev(struct nullb *nullb)
if (test_bit(NULLB_DEV_FL_THROTTLED, &nullb->dev->flags)) {
hrtimer_cancel(&nullb->bw_timer);
atomic_long_set(&nullb->cur_bytes, LONG_MAX);
null_restart_queue_async(nullb);
blk_mq_start_stopped_hw_queues(nullb->q, true);
}
put_disk(nullb->disk);
if (dev->queue_mode == NULL_Q_MQ &&
nullb->tag_set == &nullb->__tag_set)
if (nullb->tag_set == &nullb->__tag_set)
blk_mq_free_tag_set(nullb->tag_set);
cleanup_queues(nullb);
kfree(nullb->queues);
if (null_cache_active(nullb))
null_free_device_storage(nullb->dev, true);
kfree(nullb);
dev->nullb = NULL;
}
static void null_config_discard(struct nullb *nullb)
static void null_config_discard(struct nullb *nullb, struct queue_limits *lim)
{
if (nullb->dev->discard == false)
return;
@ -1880,43 +1701,14 @@ static void null_config_discard(struct nullb *nullb)
return;
}
blk_queue_max_discard_sectors(nullb->q, UINT_MAX >> 9);
lim->max_hw_discard_sectors = UINT_MAX >> 9;
}
static const struct block_device_operations null_bio_ops = {
.owner = THIS_MODULE,
.submit_bio = null_submit_bio,
.report_zones = null_report_zones,
};
static const struct block_device_operations null_rq_ops = {
static const struct block_device_operations null_ops = {
.owner = THIS_MODULE,
.report_zones = null_report_zones,
};
static int setup_commands(struct nullb_queue *nq)
{
struct nullb_cmd *cmd;
int i;
nq->cmds = kcalloc(nq->queue_depth, sizeof(*cmd), GFP_KERNEL);
if (!nq->cmds)
return -ENOMEM;
nq->tag_map = bitmap_zalloc(nq->queue_depth, GFP_KERNEL);
if (!nq->tag_map) {
kfree(nq->cmds);
return -ENOMEM;
}
for (i = 0; i < nq->queue_depth; i++) {
cmd = &nq->cmds[i];
cmd->tag = -1U;
}
return 0;
}
static int setup_queues(struct nullb *nullb)
{
int nqueues = nr_cpu_ids;
@ -1929,101 +1721,66 @@ static int setup_queues(struct nullb *nullb)
if (!nullb->queues)
return -ENOMEM;
nullb->queue_depth = nullb->dev->hw_queue_depth;
return 0;
}
static int init_driver_queues(struct nullb *nullb)
static int null_init_tag_set(struct blk_mq_tag_set *set, int poll_queues)
{
struct nullb_queue *nq;
int i, ret = 0;
for (i = 0; i < nullb->dev->submit_queues; i++) {
nq = &nullb->queues[i];
null_init_queue(nullb, nq);
ret = setup_commands(nq);
if (ret)
return ret;
nullb->nr_queues++;
}
return 0;
}
static int null_gendisk_register(struct nullb *nullb)
{
sector_t size = ((sector_t)nullb->dev->size * SZ_1M) >> SECTOR_SHIFT;
struct gendisk *disk = nullb->disk;
set_capacity(disk, size);
disk->major = null_major;
disk->first_minor = nullb->index;
disk->minors = 1;
if (queue_is_mq(nullb->q))
disk->fops = &null_rq_ops;
else
disk->fops = &null_bio_ops;
disk->private_data = nullb;
strscpy_pad(disk->disk_name, nullb->disk_name, DISK_NAME_LEN);
if (nullb->dev->zoned) {
int ret = null_register_zoned_dev(nullb);
if (ret)
return ret;
}
return add_disk(disk);
}
static int null_init_tag_set(struct nullb *nullb, struct blk_mq_tag_set *set)
{
unsigned int flags = BLK_MQ_F_SHOULD_MERGE;
int hw_queues, numa_node;
unsigned int queue_depth;
int poll_queues;
if (nullb) {
hw_queues = nullb->dev->submit_queues;
poll_queues = nullb->dev->poll_queues;
queue_depth = nullb->dev->hw_queue_depth;
numa_node = nullb->dev->home_node;
if (nullb->dev->no_sched)
flags |= BLK_MQ_F_NO_SCHED;
if (nullb->dev->shared_tag_bitmap)
flags |= BLK_MQ_F_TAG_HCTX_SHARED;
if (nullb->dev->blocking)
flags |= BLK_MQ_F_BLOCKING;
} else {
hw_queues = g_submit_queues;
poll_queues = g_poll_queues;
queue_depth = g_hw_queue_depth;
numa_node = g_home_node;
if (g_no_sched)
flags |= BLK_MQ_F_NO_SCHED;
if (g_shared_tag_bitmap)
flags |= BLK_MQ_F_TAG_HCTX_SHARED;
if (g_blocking)
flags |= BLK_MQ_F_BLOCKING;
}
set->ops = &null_mq_ops;
set->cmd_size = sizeof(struct nullb_cmd);
set->flags = flags;
set->driver_data = nullb;
set->nr_hw_queues = hw_queues;
set->queue_depth = queue_depth;
set->numa_node = numa_node;
set->cmd_size = sizeof(struct nullb_cmd);
set->timeout = 5 * HZ;
set->nr_maps = 1;
if (poll_queues) {
set->nr_hw_queues += poll_queues;
set->nr_maps = 3;
} else {
set->nr_maps = 1;
set->nr_maps += 2;
}
return blk_mq_alloc_tag_set(set);
}
static int null_init_global_tag_set(void)
{
int error;
if (tag_set.ops)
return 0;
tag_set.nr_hw_queues = g_submit_queues;
tag_set.queue_depth = g_hw_queue_depth;
tag_set.numa_node = g_home_node;
tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
if (g_no_sched)
tag_set.flags |= BLK_MQ_F_NO_SCHED;
if (g_shared_tag_bitmap)
tag_set.flags |= BLK_MQ_F_TAG_HCTX_SHARED;
if (g_blocking)
tag_set.flags |= BLK_MQ_F_BLOCKING;
error = null_init_tag_set(&tag_set, g_poll_queues);
if (error)
tag_set.ops = NULL;
return error;
}
static int null_setup_tagset(struct nullb *nullb)
{
if (nullb->dev->shared_tags) {
nullb->tag_set = &tag_set;
return null_init_global_tag_set();
}
return blk_mq_alloc_tag_set(set);
nullb->tag_set = &nullb->__tag_set;
nullb->tag_set->driver_data = nullb;
nullb->tag_set->nr_hw_queues = nullb->dev->submit_queues;
nullb->tag_set->queue_depth = nullb->dev->hw_queue_depth;
nullb->tag_set->numa_node = nullb->dev->home_node;
nullb->tag_set->flags = BLK_MQ_F_SHOULD_MERGE;
if (nullb->dev->no_sched)
nullb->tag_set->flags |= BLK_MQ_F_NO_SCHED;
if (nullb->dev->shared_tag_bitmap)
nullb->tag_set->flags |= BLK_MQ_F_TAG_HCTX_SHARED;
if (nullb->dev->blocking)
nullb->tag_set->flags |= BLK_MQ_F_BLOCKING;
return null_init_tag_set(nullb->tag_set, nullb->dev->poll_queues);
}
static int null_validate_conf(struct nullb_device *dev)
@ -2032,11 +1789,15 @@ static int null_validate_conf(struct nullb_device *dev)
pr_err("legacy IO path is no longer available\n");
return -EINVAL;
}
if (dev->queue_mode == NULL_Q_BIO) {
pr_err("BIO-based IO path is no longer available, using blk-mq instead.\n");
dev->queue_mode = NULL_Q_MQ;
}
dev->blocksize = round_down(dev->blocksize, 512);
dev->blocksize = clamp_t(unsigned int, dev->blocksize, 512, 4096);
if (dev->queue_mode == NULL_Q_MQ && dev->use_per_node_hctx) {
if (dev->use_per_node_hctx) {
if (dev->submit_queues != nr_online_nodes)
dev->submit_queues = nr_online_nodes;
} else if (dev->submit_queues > nr_cpu_ids)
@ -2048,8 +1809,6 @@ static int null_validate_conf(struct nullb_device *dev)
if (dev->poll_queues > g_poll_queues)
dev->poll_queues = g_poll_queues;
dev->prev_poll_queues = dev->poll_queues;
dev->queue_mode = min_t(unsigned int, dev->queue_mode, NULL_Q_MQ);
dev->irqmode = min_t(unsigned int, dev->irqmode, NULL_IRQ_TIMER);
/* Do memory allocation, so set blocking */
@ -2060,9 +1819,6 @@ static int null_validate_conf(struct nullb_device *dev)
dev->cache_size = min_t(unsigned long, ULONG_MAX / 1024 / 1024,
dev->cache_size);
dev->mbps = min_t(unsigned int, 1024 * 40, dev->mbps);
/* can not stop a queue */
if (dev->queue_mode == NULL_Q_BIO)
dev->mbps = 0;
if (dev->zoned &&
(!dev->zone_size || !is_power_of_2(dev->zone_size))) {
@ -2102,6 +1858,12 @@ static bool null_setup_fault(void)
static int null_add_dev(struct nullb_device *dev)
{
struct queue_limits lim = {
.logical_block_size = dev->blocksize,
.physical_block_size = dev->blocksize,
.max_hw_sectors = dev->max_sectors,
};
struct nullb *nullb;
int rv;
@ -2123,37 +1885,26 @@ static int null_add_dev(struct nullb_device *dev)
if (rv)
goto out_free_nullb;
if (dev->queue_mode == NULL_Q_MQ) {
if (shared_tags) {
nullb->tag_set = &tag_set;
rv = 0;
} else {
nullb->tag_set = &nullb->__tag_set;
rv = null_init_tag_set(nullb, nullb->tag_set);
}
rv = null_setup_tagset(nullb);
if (rv)
goto out_cleanup_queues;
if (dev->virt_boundary)
lim.virt_boundary_mask = PAGE_SIZE - 1;
null_config_discard(nullb, &lim);
if (dev->zoned) {
rv = null_init_zoned_dev(dev, &lim);
if (rv)
goto out_cleanup_queues;
nullb->tag_set->timeout = 5 * HZ;
nullb->disk = blk_mq_alloc_disk(nullb->tag_set, nullb);
if (IS_ERR(nullb->disk)) {
rv = PTR_ERR(nullb->disk);
goto out_cleanup_tags;
}
nullb->q = nullb->disk->queue;
} else if (dev->queue_mode == NULL_Q_BIO) {
rv = -ENOMEM;
nullb->disk = blk_alloc_disk(nullb->dev->home_node);
if (!nullb->disk)
goto out_cleanup_queues;
nullb->q = nullb->disk->queue;
rv = init_driver_queues(nullb);
if (rv)
goto out_cleanup_disk;
}
nullb->disk = blk_mq_alloc_disk(nullb->tag_set, &lim, nullb);
if (IS_ERR(nullb->disk)) {
rv = PTR_ERR(nullb->disk);
goto out_cleanup_zone;
}
nullb->q = nullb->disk->queue;
if (dev->mbps) {
set_bit(NULLB_DEV_FL_THROTTLED, &dev->flags);
nullb_setup_bwtimer(nullb);
@ -2164,12 +1915,6 @@ static int null_add_dev(struct nullb_device *dev)
blk_queue_write_cache(nullb->q, true, true);
}
if (dev->zoned) {
rv = null_init_zoned_dev(dev, nullb->q);
if (rv)
goto out_cleanup_disk;
}
nullb->q->queuedata = nullb;
blk_queue_flag_set(QUEUE_FLAG_NONROT, nullb->q);
@ -2177,22 +1922,12 @@ static int null_add_dev(struct nullb_device *dev)
rv = ida_alloc(&nullb_indexes, GFP_KERNEL);
if (rv < 0) {
mutex_unlock(&lock);
goto out_cleanup_zone;
goto out_cleanup_disk;
}
nullb->index = rv;
dev->index = rv;
mutex_unlock(&lock);
blk_queue_logical_block_size(nullb->q, dev->blocksize);
blk_queue_physical_block_size(nullb->q, dev->blocksize);
if (dev->max_sectors)
blk_queue_max_hw_sectors(nullb->q, dev->max_sectors);
if (dev->virt_boundary)
blk_queue_virt_boundary(nullb->q, PAGE_SIZE - 1);
null_config_discard(nullb);
if (config_item_name(&dev->group.cg_item)) {
/* Use configfs dir name as the device name */
snprintf(nullb->disk_name, sizeof(nullb->disk_name),
@ -2201,7 +1936,22 @@ static int null_add_dev(struct nullb_device *dev)
sprintf(nullb->disk_name, "nullb%d", nullb->index);
}
rv = null_gendisk_register(nullb);
set_capacity(nullb->disk,
((sector_t)nullb->dev->size * SZ_1M) >> SECTOR_SHIFT);
nullb->disk->major = null_major;
nullb->disk->first_minor = nullb->index;
nullb->disk->minors = 1;
nullb->disk->fops = &null_ops;
nullb->disk->private_data = nullb;
strscpy_pad(nullb->disk->disk_name, nullb->disk_name, DISK_NAME_LEN);
if (nullb->dev->zoned) {
rv = null_register_zoned_dev(nullb);
if (rv)
goto out_ida_free;
}
rv = add_disk(nullb->disk);
if (rv)
goto out_ida_free;
@ -2220,10 +1970,10 @@ out_cleanup_zone:
out_cleanup_disk:
put_disk(nullb->disk);
out_cleanup_tags:
if (dev->queue_mode == NULL_Q_MQ && nullb->tag_set == &nullb->__tag_set)
if (nullb->tag_set == &nullb->__tag_set)
blk_mq_free_tag_set(nullb->tag_set);
out_cleanup_queues:
cleanup_queues(nullb);
kfree(nullb->queues);
out_free_nullb:
kfree(nullb);
dev->nullb = NULL;
@ -2299,7 +2049,7 @@ static int __init null_init(void)
return -EINVAL;
}
if (g_queue_mode == NULL_Q_MQ && g_use_per_node_hctx) {
if (g_use_per_node_hctx) {
if (g_submit_queues != nr_online_nodes) {
pr_warn("submit_queues param is set to %u.\n",
nr_online_nodes);
@ -2311,18 +2061,12 @@ static int __init null_init(void)
g_submit_queues = 1;
}
if (g_queue_mode == NULL_Q_MQ && shared_tags) {
ret = null_init_tag_set(NULL, &tag_set);
if (ret)
return ret;
}
config_group_init(&nullb_subsys.su_group);
mutex_init(&nullb_subsys.su_mutex);
ret = configfs_register_subsystem(&nullb_subsys);
if (ret)
goto err_tagset;
return ret;
mutex_init(&lock);
@ -2349,9 +2093,6 @@ err_dev:
unregister_blkdev(null_major, "nullb");
err_conf:
configfs_unregister_subsystem(&nullb_subsys);
err_tagset:
if (g_queue_mode == NULL_Q_MQ && shared_tags)
blk_mq_free_tag_set(&tag_set);
return ret;
}
@ -2370,7 +2111,7 @@ static void __exit null_exit(void)
}
mutex_unlock(&lock);
if (g_queue_mode == NULL_Q_MQ && shared_tags)
if (tag_set.ops)
blk_mq_free_tag_set(&tag_set);
}

View File

@ -16,11 +16,6 @@
#include <linux/mutex.h>
struct nullb_cmd {
union {
struct request *rq;
struct bio *bio;
};
unsigned int tag;
blk_status_t error;
bool fake_timeout;
struct nullb_queue *nq;
@ -28,16 +23,11 @@ struct nullb_cmd {
};
/*
 * Per-queue state of a null block device. struct nullb_cmd refers to its
 * queue through ->nq, and struct nullb reaches its queues through ->queues
 * (with the count kept in ->nr_queues).
 */
struct nullb_queue {
/*
 * NOTE(review): tag_map, wait and queue_depth look like a driver-private
 * tag allocator (bitmap, waiters, depth) - confirm against the users.
 */
unsigned long *tag_map;
wait_queue_head_t wait;
unsigned int queue_depth;
/* Presumably the device this queue belongs to - confirm. */
struct nullb_device *dev;
unsigned int requeue_selection;
/* NOTE(review): poll_lock presumably protects poll_list - confirm. */
struct list_head poll_list;
spinlock_t poll_lock;
/* NOTE(review): presumably the commands set aside for this queue - confirm. */
struct nullb_cmd *cmds;
};
struct nullb_zone {
@ -60,13 +50,6 @@ struct nullb_zone {
unsigned int capacity;
};
/*
 * Queue modes. The enumerators are numbered consecutively from zero in the
 * order listed, so their values are 0, 1 and 2.
 */
enum {
	NULL_Q_BIO,	/* 0 */
	NULL_Q_RQ,	/* 1 */
	NULL_Q_MQ,	/* 2 */
};
struct nullb_device {
struct nullb *nullb;
struct config_group group;
@ -119,6 +102,7 @@ struct nullb_device {
bool zoned; /* if device is zoned */
bool virt_boundary; /* virtual boundary on/off for the device */
bool no_sched; /* no IO scheduler for the device */
bool shared_tags; /* share tag set between devices for blk-mq */
bool shared_tag_bitmap; /* use hostwide shared tags */
};
@ -130,14 +114,12 @@ struct nullb {
struct gendisk *disk;
struct blk_mq_tag_set *tag_set;
struct blk_mq_tag_set __tag_set;
unsigned int queue_depth;
atomic_long_t cur_bytes;
struct hrtimer bw_timer;
unsigned long cache_flush_pos;
spinlock_t lock;
struct nullb_queue *queues;
unsigned int nr_queues;
char disk_name[DISK_NAME_LEN];
};
@ -147,7 +129,7 @@ blk_status_t null_process_cmd(struct nullb_cmd *cmd, enum req_op op,
sector_t sector, unsigned int nr_sectors);
#ifdef CONFIG_BLK_DEV_ZONED
int null_init_zoned_dev(struct nullb_device *dev, struct request_queue *q);
int null_init_zoned_dev(struct nullb_device *dev, struct queue_limits *lim);
int null_register_zoned_dev(struct nullb *nullb);
void null_free_zoned_dev(struct nullb_device *dev);
int null_report_zones(struct gendisk *disk, sector_t sector,
@ -160,7 +142,7 @@ ssize_t zone_cond_store(struct nullb_device *dev, const char *page,
size_t count, enum blk_zone_cond cond);
#else
static inline int null_init_zoned_dev(struct nullb_device *dev,
struct request_queue *q)
struct queue_limits *lim)
{
pr_err("CONFIG_BLK_DEV_ZONED not enabled\n");
return -EINVAL;

View File

@ -41,10 +41,11 @@ TRACE_EVENT(nullb_zone_op,
__field(unsigned int, zone_cond)
),
TP_fast_assign(
__entry->op = req_op(cmd->rq);
__entry->op = req_op(blk_mq_rq_from_pdu(cmd));
__entry->zone_no = zone_no;
__entry->zone_cond = zone_cond;
__assign_disk_name(__entry->disk, cmd->rq->q->disk);
__assign_disk_name(__entry->disk,
blk_mq_rq_from_pdu(cmd)->q->disk);
),
TP_printk("%s req=%-15s zone_no=%u zone_cond=%-10s",
__print_disk_name(__entry->disk),

View File

@ -58,7 +58,8 @@ static inline void null_unlock_zone(struct nullb_device *dev,
mutex_unlock(&zone->mutex);
}
int null_init_zoned_dev(struct nullb_device *dev, struct request_queue *q)
int null_init_zoned_dev(struct nullb_device *dev,
struct queue_limits *lim)
{
sector_t dev_capacity_sects, zone_capacity_sects;
struct nullb_zone *zone;
@ -151,27 +152,22 @@ int null_init_zoned_dev(struct nullb_device *dev, struct request_queue *q)
sector += dev->zone_size_sects;
}
lim->zoned = true;
lim->chunk_sectors = dev->zone_size_sects;
lim->max_zone_append_sectors = dev->zone_size_sects;
lim->max_open_zones = dev->zone_max_open;
lim->max_active_zones = dev->zone_max_active;
return 0;
}
int null_register_zoned_dev(struct nullb *nullb)
{
struct nullb_device *dev = nullb->dev;
struct request_queue *q = nullb->q;
disk_set_zoned(nullb->disk);
blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, q);
blk_queue_required_elevator_features(q, ELEVATOR_F_ZBD_SEQ_WRITE);
blk_queue_chunk_sectors(q, dev->zone_size_sects);
nullb->disk->nr_zones = bdev_nr_zones(nullb->disk->part0);
blk_queue_max_zone_append_sectors(q, dev->zone_size_sects);
disk_set_max_open_zones(nullb->disk, dev->zone_max_open);
disk_set_max_active_zones(nullb->disk, dev->zone_max_active);
if (queue_is_mq(q))
return blk_revalidate_disk_zones(nullb->disk, NULL);
return 0;
return blk_revalidate_disk_zones(nullb->disk, NULL);
}
void null_free_zoned_dev(struct nullb_device *dev)
@ -394,10 +390,7 @@ static blk_status_t null_zone_write(struct nullb_cmd *cmd, sector_t sector,
*/
if (append) {
sector = zone->wp;
if (dev->queue_mode == NULL_Q_MQ)
cmd->rq->__sector = sector;
else
cmd->bio->bi_iter.bi_sector = sector;
blk_mq_rq_from_pdu(cmd)->__sector = sector;
} else if (sector != zone->wp) {
ret = BLK_STS_IOERR;
goto unlock;

View File

@ -828,6 +828,12 @@ static noinline_for_stack int pkt_set_speed(struct pktcdvd_device *pd,
*/
static void pkt_queue_bio(struct pktcdvd_device *pd, struct bio *bio)
{
/*
* Some CDRW drives can not handle writes larger than one packet,
* even if the size is a multiple of the packet size.
*/
bio->bi_opf |= REQ_NOMERGE;
spin_lock(&pd->iosched.lock);
if (bio_data_dir(bio) == READ)
bio_list_add(&pd->iosched.read_queue, bio);
@ -2191,11 +2197,6 @@ static int pkt_open_dev(struct pktcdvd_device *pd, bool write)
ret = pkt_open_write(pd);
if (ret)
goto out_putdev;
/*
* Some CDRW drives can not handle writes larger than one packet,
* even if the size is a multiple of the packet size.
*/
blk_queue_max_hw_sectors(q, pd->settings.size);
set_bit(PACKET_WRITABLE, &pd->flags);
} else {
pkt_set_speed(pd, MAX_SPEED, MAX_SPEED);
@ -2338,9 +2339,9 @@ static void pkt_make_request_read(struct pktcdvd_device *pd, struct bio *bio)
pkt_queue_bio(pd, cloned_bio);
}
static void pkt_make_request_write(struct request_queue *q, struct bio *bio)
static void pkt_make_request_write(struct bio *bio)
{
struct pktcdvd_device *pd = q->queuedata;
struct pktcdvd_device *pd = bio->bi_bdev->bd_disk->private_data;
sector_t zone;
struct packet_data *pkt;
int was_empty, blocked_bio;
@ -2432,7 +2433,7 @@ static void pkt_make_request_write(struct request_queue *q, struct bio *bio)
static void pkt_submit_bio(struct bio *bio)
{
struct pktcdvd_device *pd = bio->bi_bdev->bd_disk->queue->queuedata;
struct pktcdvd_device *pd = bio->bi_bdev->bd_disk->private_data;
struct device *ddev = disk_to_dev(pd->disk);
struct bio *split;
@ -2476,7 +2477,7 @@ static void pkt_submit_bio(struct bio *bio)
split = bio;
}
pkt_make_request_write(bio->bi_bdev->bd_disk->queue, split);
pkt_make_request_write(split);
} while (split != bio);
return;
@ -2484,15 +2485,6 @@ end_io:
bio_io_error(bio);
}
/*
 * Set up the request queue of the packet device's disk: logical block size
 * CD_FRAMESIZE, a max_hw_sectors limit of PACKET_MAX_SECTORS, and a
 * back-pointer to @pd stored in the queue's queuedata.
 */
static void pkt_init_queue(struct pktcdvd_device *pd)
{
	blk_queue_logical_block_size(pd->disk->queue, CD_FRAMESIZE);
	blk_queue_max_hw_sectors(pd->disk->queue, PACKET_MAX_SECTORS);

	/* Lets code that only holds the queue get back to the device. */
	pd->disk->queue->queuedata = pd;
}
static int pkt_new_dev(struct pktcdvd_device *pd, dev_t dev)
{
struct device *ddev = disk_to_dev(pd->disk);
@ -2536,8 +2528,6 @@ static int pkt_new_dev(struct pktcdvd_device *pd, dev_t dev)
pd->bdev_file = bdev_file;
set_blocksize(file_bdev(bdev_file), CD_FRAMESIZE);
pkt_init_queue(pd);
atomic_set(&pd->cdrw.pending_bios, 0);
pd->cdrw.thread = kthread_run(kcdrwd, pd, "%s", pd->disk->disk_name);
if (IS_ERR(pd->cdrw.thread)) {
@ -2634,6 +2624,10 @@ static const struct block_device_operations pktcdvd_ops = {
*/
static int pkt_setup_dev(dev_t dev, dev_t* pkt_dev)
{
struct queue_limits lim = {
.max_hw_sectors = PACKET_MAX_SECTORS,
.logical_block_size = CD_FRAMESIZE,
};
int idx;
int ret = -ENOMEM;
struct pktcdvd_device *pd;
@ -2673,10 +2667,11 @@ static int pkt_setup_dev(dev_t dev, dev_t* pkt_dev)
pd->write_congestion_on = write_congestion_on;
pd->write_congestion_off = write_congestion_off;
ret = -ENOMEM;
disk = blk_alloc_disk(NUMA_NO_NODE);
if (!disk)
disk = blk_alloc_disk(&lim, NUMA_NO_NODE);
if (IS_ERR(disk)) {
ret = PTR_ERR(disk);
goto out_mem;
}
pd->disk = disk;
disk->major = pktdev_major;
disk->first_minor = idx;

View File

@ -382,6 +382,14 @@ static int ps3disk_probe(struct ps3_system_bus_device *_dev)
struct ps3disk_private *priv;
int error;
unsigned int devidx;
struct queue_limits lim = {
.logical_block_size = dev->blk_size,
.max_hw_sectors = dev->bounce_size >> 9,
.max_segments = -1,
.max_segment_size = dev->bounce_size,
.dma_alignment = dev->blk_size - 1,
};
struct request_queue *queue;
struct gendisk *gendisk;
@ -431,7 +439,7 @@ static int ps3disk_probe(struct ps3_system_bus_device *_dev)
if (error)
goto fail_teardown;
gendisk = blk_mq_alloc_disk(&priv->tag_set, dev);
gendisk = blk_mq_alloc_disk(&priv->tag_set, &lim, dev);
if (IS_ERR(gendisk)) {
dev_err(&dev->sbd.core, "%s:%u: blk_mq_alloc_disk failed\n",
__func__, __LINE__);
@ -441,15 +449,8 @@ static int ps3disk_probe(struct ps3_system_bus_device *_dev)
queue = gendisk->queue;
blk_queue_max_hw_sectors(queue, dev->bounce_size >> 9);
blk_queue_dma_alignment(queue, dev->blk_size-1);
blk_queue_logical_block_size(queue, dev->blk_size);
blk_queue_write_cache(queue, true, false);
blk_queue_max_segments(queue, -1);
blk_queue_max_segment_size(queue, dev->bounce_size);
priv->gendisk = gendisk;
gendisk->major = ps3disk_major;
gendisk->first_minor = devidx * PS3DISK_MINORS;

View File

@ -730,10 +730,10 @@ static int ps3vram_probe(struct ps3_system_bus_device *dev)
ps3vram_proc_init(dev);
gendisk = blk_alloc_disk(NUMA_NO_NODE);
if (!gendisk) {
gendisk = blk_alloc_disk(NULL, NUMA_NO_NODE);
if (IS_ERR(gendisk)) {
dev_err(&dev->core, "blk_alloc_disk failed\n");
error = -ENOMEM;
error = PTR_ERR(gendisk);
goto out_cache_cleanup;
}

View File

@ -575,7 +575,7 @@ static const struct attribute_group rbd_bus_group = {
};
__ATTRIBUTE_GROUPS(rbd_bus);
static struct bus_type rbd_bus_type = {
static const struct bus_type rbd_bus_type = {
.name = "rbd",
.bus_groups = rbd_bus_groups,
};
@ -4952,6 +4952,14 @@ static int rbd_init_disk(struct rbd_device *rbd_dev)
struct request_queue *q;
unsigned int objset_bytes =
rbd_dev->layout.object_size * rbd_dev->layout.stripe_count;
struct queue_limits lim = {
.max_hw_sectors = objset_bytes >> SECTOR_SHIFT,
.max_user_sectors = objset_bytes >> SECTOR_SHIFT,
.io_min = rbd_dev->opts->alloc_size,
.io_opt = rbd_dev->opts->alloc_size,
.max_segments = USHRT_MAX,
.max_segment_size = UINT_MAX,
};
int err;
memset(&rbd_dev->tag_set, 0, sizeof(rbd_dev->tag_set));
@ -4966,7 +4974,13 @@ static int rbd_init_disk(struct rbd_device *rbd_dev)
if (err)
return err;
disk = blk_mq_alloc_disk(&rbd_dev->tag_set, rbd_dev);
if (rbd_dev->opts->trim) {
lim.discard_granularity = rbd_dev->opts->alloc_size;
lim.max_hw_discard_sectors = objset_bytes >> SECTOR_SHIFT;
lim.max_write_zeroes_sectors = objset_bytes >> SECTOR_SHIFT;
}
disk = blk_mq_alloc_disk(&rbd_dev->tag_set, &lim, rbd_dev);
if (IS_ERR(disk)) {
err = PTR_ERR(disk);
goto out_tag_set;
@ -4987,19 +5001,6 @@ static int rbd_init_disk(struct rbd_device *rbd_dev)
blk_queue_flag_set(QUEUE_FLAG_NONROT, q);
/* QUEUE_FLAG_ADD_RANDOM is off by default for blk-mq */
blk_queue_max_hw_sectors(q, objset_bytes >> SECTOR_SHIFT);
q->limits.max_sectors = queue_max_hw_sectors(q);
blk_queue_max_segments(q, USHRT_MAX);
blk_queue_max_segment_size(q, UINT_MAX);
blk_queue_io_min(q, rbd_dev->opts->alloc_size);
blk_queue_io_opt(q, rbd_dev->opts->alloc_size);
if (rbd_dev->opts->trim) {
q->limits.discard_granularity = rbd_dev->opts->alloc_size;
blk_queue_max_discard_sectors(q, objset_bytes >> SECTOR_SHIFT);
blk_queue_max_write_zeroes_sectors(q, objset_bytes >> SECTOR_SHIFT);
}
if (!ceph_test_opt(rbd_dev->rbd_client->client, NOCRC))
blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, q);

View File

@ -1329,43 +1329,6 @@ static void rnbd_init_mq_hw_queues(struct rnbd_clt_dev *dev)
}
}
/*
 * Program the request queue of @dev from the session parameters in
 * dev->sess and from the little-endian fields of the open response @rsp.
 * The queue setters are issued in a fixed order; only the decoding of the
 * response fields is gathered at the top.
 */
static void setup_request_queue(struct rnbd_clt_dev *dev,
				struct rnbd_msg_open_rsp *rsp)
{
	const unsigned int logical_bs = le16_to_cpu(rsp->logical_block_size);
	const unsigned int physical_bs = le16_to_cpu(rsp->physical_block_size);
	const unsigned int max_discard = le32_to_cpu(rsp->max_discard_sectors);
	const unsigned int discard_gran = le32_to_cpu(rsp->discard_granularity);
	const unsigned int discard_align = le32_to_cpu(rsp->discard_alignment);
	const unsigned int max_wzeroes =
		le32_to_cpu(rsp->max_write_zeroes_sectors);

	blk_queue_logical_block_size(dev->queue, logical_bs);
	blk_queue_physical_block_size(dev->queue, physical_bs);
	blk_queue_max_hw_sectors(dev->queue,
				 dev->sess->max_io_size / SECTOR_SIZE);

	/*
	 * We don't support discards to "discontiguous" segments
	 * in one request.
	 */
	blk_queue_max_discard_segments(dev->queue, 1);

	blk_queue_max_discard_sectors(dev->queue, max_discard);
	dev->queue->limits.discard_granularity = discard_gran;
	dev->queue->limits.discard_alignment = discard_align;

	/* Secure erase reuses the discard sector limit from the response. */
	if (le16_to_cpu(rsp->secure_discard))
		blk_queue_max_secure_erase_sectors(dev->queue, max_discard);

	blk_queue_flag_set(QUEUE_FLAG_SAME_COMP, dev->queue);
	blk_queue_flag_set(QUEUE_FLAG_SAME_FORCE, dev->queue);

	blk_queue_max_segments(dev->queue, dev->sess->max_segments);
	blk_queue_io_opt(dev->queue, dev->sess->max_io_size);
	blk_queue_virt_boundary(dev->queue, SZ_4K - 1);

	/* Write-cache and FUA support come from the response's cache policy bits. */
	blk_queue_write_cache(dev->queue,
			      (rsp->cache_policy & RNBD_WRITEBACK) != 0,
			      (rsp->cache_policy & RNBD_FUA) != 0);

	blk_queue_max_write_zeroes_sectors(dev->queue, max_wzeroes);
}
static int rnbd_clt_setup_gen_disk(struct rnbd_clt_dev *dev,
struct rnbd_msg_open_rsp *rsp, int idx)
{
@ -1403,18 +1366,41 @@ static int rnbd_clt_setup_gen_disk(struct rnbd_clt_dev *dev,
static int rnbd_client_setup_device(struct rnbd_clt_dev *dev,
struct rnbd_msg_open_rsp *rsp)
{
struct queue_limits lim = {
.logical_block_size = le16_to_cpu(rsp->logical_block_size),
.physical_block_size = le16_to_cpu(rsp->physical_block_size),
.io_opt = dev->sess->max_io_size,
.max_hw_sectors = dev->sess->max_io_size / SECTOR_SIZE,
.max_hw_discard_sectors = le32_to_cpu(rsp->max_discard_sectors),
.discard_granularity = le32_to_cpu(rsp->discard_granularity),
.discard_alignment = le32_to_cpu(rsp->discard_alignment),
.max_segments = dev->sess->max_segments,
.virt_boundary_mask = SZ_4K - 1,
.max_write_zeroes_sectors =
le32_to_cpu(rsp->max_write_zeroes_sectors),
};
int idx = dev->clt_device_id;
dev->size = le64_to_cpu(rsp->nsectors) *
le16_to_cpu(rsp->logical_block_size);
dev->gd = blk_mq_alloc_disk(&dev->sess->tag_set, dev);
if (rsp->secure_discard) {
lim.max_secure_erase_sectors =
le32_to_cpu(rsp->max_discard_sectors);
}
dev->gd = blk_mq_alloc_disk(&dev->sess->tag_set, &lim, dev);
if (IS_ERR(dev->gd))
return PTR_ERR(dev->gd);
dev->queue = dev->gd->queue;
rnbd_init_mq_hw_queues(dev);
setup_request_queue(dev, rsp);
blk_queue_flag_set(QUEUE_FLAG_SAME_COMP, dev->queue);
blk_queue_flag_set(QUEUE_FLAG_SAME_FORCE, dev->queue);
blk_queue_write_cache(dev->queue,
!!(rsp->cache_policy & RNBD_WRITEBACK),
!!(rsp->cache_policy & RNBD_FUA));
return rnbd_clt_setup_gen_disk(dev, rsp, idx);
}

View File

@ -784,6 +784,14 @@ static const struct blk_mq_ops vdc_mq_ops = {
static int probe_disk(struct vdc_port *port)
{
struct queue_limits lim = {
.physical_block_size = port->vdisk_phys_blksz,
.max_hw_sectors = port->max_xfer_size,
/* Each segment in a request is up to an aligned page in size. */
.seg_boundary_mask = PAGE_SIZE - 1,
.max_segment_size = PAGE_SIZE,
.max_segments = port->ring_cookies,
};
struct request_queue *q;
struct gendisk *g;
int err;
@ -824,7 +832,7 @@ static int probe_disk(struct vdc_port *port)
if (err)
return err;
g = blk_mq_alloc_disk(&port->tag_set, port);
g = blk_mq_alloc_disk(&port->tag_set, &lim, port);
if (IS_ERR(g)) {
printk(KERN_ERR PFX "%s: Could not allocate gendisk.\n",
port->vio.name);
@ -835,12 +843,6 @@ static int probe_disk(struct vdc_port *port)
port->disk = g;
q = g->queue;
/* Each segment in a request is up to an aligned page in size. */
blk_queue_segment_boundary(q, PAGE_SIZE - 1);
blk_queue_max_segment_size(q, PAGE_SIZE);
blk_queue_max_segments(q, port->ring_cookies);
blk_queue_max_hw_sectors(q, port->max_xfer_size);
g->major = vdc_major;
g->first_minor = port->vio.vdev->dev_no << PARTITION_SHIFT;
g->minors = 1 << PARTITION_SHIFT;
@ -872,8 +874,6 @@ static int probe_disk(struct vdc_port *port)
}
}
blk_queue_physical_block_size(q, port->vdisk_phys_blksz);
pr_info(PFX "%s: %u sectors (%u MB) protocol %d.%d\n",
g->disk_name,
port->vdisk_size, (port->vdisk_size >> (20 - 9)),

View File

@ -820,7 +820,7 @@ static int swim_floppy_init(struct swim_priv *swd)
goto exit_put_disks;
swd->unit[drive].disk =
blk_mq_alloc_disk(&swd->unit[drive].tag_set,
blk_mq_alloc_disk(&swd->unit[drive].tag_set, NULL,
&swd->unit[drive]);
if (IS_ERR(swd->unit[drive].disk)) {
blk_mq_free_tag_set(&swd->unit[drive].tag_set);
@ -916,7 +916,7 @@ out:
return ret;
}
static int swim_remove(struct platform_device *dev)
static void swim_remove(struct platform_device *dev)
{
struct swim_priv *swd = platform_get_drvdata(dev);
int drive;
@ -937,13 +937,11 @@ static int swim_remove(struct platform_device *dev)
release_mem_region(res->start, resource_size(res));
kfree(swd);
return 0;
}
static struct platform_driver swim_driver = {
.probe = swim_probe,
.remove = swim_remove,
.remove_new = swim_remove,
.driver = {
.name = CARDNAME,
},

View File

@ -1210,7 +1210,7 @@ static int swim3_attach(struct macio_dev *mdev,
if (rc)
goto out_unregister;
disk = blk_mq_alloc_disk(&fs->tag_set, fs);
disk = blk_mq_alloc_disk(&fs->tag_set, NULL, fs);
if (IS_ERR(disk)) {
rc = PTR_ERR(disk);
goto out_free_tag_set;

View File

@ -246,21 +246,12 @@ static int ublk_dev_param_zoned_validate(const struct ublk_device *ub)
return 0;
}
static int ublk_dev_param_zoned_apply(struct ublk_device *ub)
static void ublk_dev_param_zoned_apply(struct ublk_device *ub)
{
const struct ublk_param_zoned *p = &ub->params.zoned;
disk_set_zoned(ub->ub_disk);
blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, ub->ub_disk->queue);
blk_queue_required_elevator_features(ub->ub_disk->queue,
ELEVATOR_F_ZBD_SEQ_WRITE);
disk_set_max_active_zones(ub->ub_disk, p->max_active_zones);
disk_set_max_open_zones(ub->ub_disk, p->max_open_zones);
blk_queue_max_zone_append_sectors(ub->ub_disk->queue, p->max_zone_append_sectors);
ub->ub_disk->nr_zones = ublk_get_nr_zones(ub);
return 0;
}
/* Based on virtblk_alloc_report_buffer */
@ -432,9 +423,8 @@ static int ublk_dev_param_zoned_validate(const struct ublk_device *ub)
return -EOPNOTSUPP;
}
static int ublk_dev_param_zoned_apply(struct ublk_device *ub)
static void ublk_dev_param_zoned_apply(struct ublk_device *ub)
{
return -EOPNOTSUPP;
}
static int ublk_revalidate_disk_zones(struct ublk_device *ub)
@ -498,11 +488,6 @@ static void ublk_dev_param_basic_apply(struct ublk_device *ub)
struct request_queue *q = ub->ub_disk->queue;
const struct ublk_param_basic *p = &ub->params.basic;
blk_queue_logical_block_size(q, 1 << p->logical_bs_shift);
blk_queue_physical_block_size(q, 1 << p->physical_bs_shift);
blk_queue_io_min(q, 1 << p->io_min_shift);
blk_queue_io_opt(q, 1 << p->io_opt_shift);
blk_queue_write_cache(q, p->attrs & UBLK_ATTR_VOLATILE_CACHE,
p->attrs & UBLK_ATTR_FUA);
if (p->attrs & UBLK_ATTR_ROTATIONAL)
@ -510,29 +495,12 @@ static void ublk_dev_param_basic_apply(struct ublk_device *ub)
else
blk_queue_flag_set(QUEUE_FLAG_NONROT, q);
blk_queue_max_hw_sectors(q, p->max_sectors);
blk_queue_chunk_sectors(q, p->chunk_sectors);
blk_queue_virt_boundary(q, p->virt_boundary_mask);
if (p->attrs & UBLK_ATTR_READ_ONLY)
set_disk_ro(ub->ub_disk, true);
set_capacity(ub->ub_disk, p->dev_sectors);
}
/*
 * Copy the discard parameters held in ub->params.discard onto the request
 * queue of the ublk disk.
 */
static void ublk_dev_param_discard_apply(struct ublk_device *ub)
{
	const struct ublk_param_discard *discard = &ub->params.discard;
	struct request_queue *queue = ub->ub_disk->queue;

	/* Alignment and granularity are written straight into the limits. */
	queue->limits.discard_alignment = discard->discard_alignment;
	queue->limits.discard_granularity = discard->discard_granularity;

	/* The remaining limits go through the block layer setters. */
	blk_queue_max_discard_sectors(queue, discard->max_discard_sectors);
	blk_queue_max_write_zeroes_sectors(queue,
					   discard->max_write_zeroes_sectors);
	blk_queue_max_discard_segments(queue, discard->max_discard_segments);
}
static int ublk_validate_params(const struct ublk_device *ub)
{
/* basic param is the only one which must be set */
@ -576,20 +544,12 @@ static int ublk_validate_params(const struct ublk_device *ub)
return 0;
}
static int ublk_apply_params(struct ublk_device *ub)
static void ublk_apply_params(struct ublk_device *ub)
{
if (!(ub->params.types & UBLK_PARAM_TYPE_BASIC))
return -EINVAL;
ublk_dev_param_basic_apply(ub);
if (ub->params.types & UBLK_PARAM_TYPE_DISCARD)
ublk_dev_param_discard_apply(ub);
if (ub->params.types & UBLK_PARAM_TYPE_ZONED)
return ublk_dev_param_zoned_apply(ub);
return 0;
ublk_dev_param_zoned_apply(ub);
}
static inline bool ublk_support_user_copy(const struct ublk_queue *ubq)
@ -645,14 +605,16 @@ static inline bool ublk_need_get_data(const struct ublk_queue *ubq)
return ubq->flags & UBLK_F_NEED_GET_DATA;
}
static struct ublk_device *ublk_get_device(struct ublk_device *ub)
/* Called in slow path only, keep it noinline for trace purpose */
static noinline struct ublk_device *ublk_get_device(struct ublk_device *ub)
{
if (kobject_get_unless_zero(&ub->cdev_dev.kobj))
return ub;
return NULL;
}
static void ublk_put_device(struct ublk_device *ub)
/* Called in slow path only, keep it noinline for trace purpose */
static noinline void ublk_put_device(struct ublk_device *ub)
{
put_device(&ub->cdev_dev);
}
@ -711,7 +673,7 @@ static void ublk_free_disk(struct gendisk *disk)
struct ublk_device *ub = disk->private_data;
clear_bit(UB_STATE_USED, &ub->state);
put_device(&ub->cdev_dev);
ublk_put_device(ub);
}
static void ublk_store_owner_uid_gid(unsigned int *owner_uid,
@ -2182,7 +2144,7 @@ static void ublk_remove(struct ublk_device *ub)
cancel_work_sync(&ub->stop_work);
cancel_work_sync(&ub->quiesce_work);
cdev_device_del(&ub->cdev, &ub->cdev_dev);
put_device(&ub->cdev_dev);
ublk_put_device(ub);
ublks_added--;
}
@ -2205,12 +2167,47 @@ static struct ublk_device *ublk_get_device_from_id(int idx)
static int ublk_ctrl_start_dev(struct ublk_device *ub, struct io_uring_cmd *cmd)
{
const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe);
const struct ublk_param_basic *p = &ub->params.basic;
int ublksrv_pid = (int)header->data[0];
struct queue_limits lim = {
.logical_block_size = 1 << p->logical_bs_shift,
.physical_block_size = 1 << p->physical_bs_shift,
.io_min = 1 << p->io_min_shift,
.io_opt = 1 << p->io_opt_shift,
.max_hw_sectors = p->max_sectors,
.chunk_sectors = p->chunk_sectors,
.virt_boundary_mask = p->virt_boundary_mask,
};
struct gendisk *disk;
int ret = -EINVAL;
if (ublksrv_pid <= 0)
return -EINVAL;
if (!(ub->params.types & UBLK_PARAM_TYPE_BASIC))
return -EINVAL;
if (ub->params.types & UBLK_PARAM_TYPE_DISCARD) {
const struct ublk_param_discard *pd = &ub->params.discard;
lim.discard_alignment = pd->discard_alignment;
lim.discard_granularity = pd->discard_granularity;
lim.max_hw_discard_sectors = pd->max_discard_sectors;
lim.max_write_zeroes_sectors = pd->max_write_zeroes_sectors;
lim.max_discard_segments = pd->max_discard_segments;
}
if (ub->params.types & UBLK_PARAM_TYPE_ZONED) {
const struct ublk_param_zoned *p = &ub->params.zoned;
if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED))
return -EOPNOTSUPP;
lim.zoned = true;
lim.max_active_zones = p->max_active_zones;
lim.max_open_zones = p->max_open_zones;
lim.max_zone_append_sectors = p->max_zone_append_sectors;
}
if (wait_for_completion_interruptible(&ub->completion) != 0)
return -EINTR;
@ -2222,7 +2219,7 @@ static int ublk_ctrl_start_dev(struct ublk_device *ub, struct io_uring_cmd *cmd)
goto out_unlock;
}
disk = blk_mq_alloc_disk(&ub->tag_set, NULL);
disk = blk_mq_alloc_disk(&ub->tag_set, &lim, NULL);
if (IS_ERR(disk)) {
ret = PTR_ERR(disk);
goto out_unlock;
@ -2234,15 +2231,13 @@ static int ublk_ctrl_start_dev(struct ublk_device *ub, struct io_uring_cmd *cmd)
ub->dev_info.ublksrv_pid = ublksrv_pid;
ub->ub_disk = disk;
ret = ublk_apply_params(ub);
if (ret)
goto out_put_disk;
ublk_apply_params(ub);
/* don't probe partitions if any one ubq daemon is un-trusted */
if (ub->nr_privileged_daemon != ub->nr_queues_ready)
set_bit(GD_SUPPRESS_PART_SCAN, &disk->state);
get_device(&ub->cdev_dev);
ublk_get_device(ub);
ub->dev_info.state = UBLK_S_DEV_LIVE;
if (ublk_dev_is_zoned(ub)) {
@ -2262,7 +2257,6 @@ out_put_cdev:
ub->dev_info.state = UBLK_S_DEV_DEAD;
ublk_put_device(ub);
}
out_put_disk:
if (ret)
put_disk(disk);
out_unlock:
@ -2474,7 +2468,7 @@ static inline bool ublk_idr_freed(int id)
return ptr == NULL;
}
static int ublk_ctrl_del_dev(struct ublk_device **p_ub)
static int ublk_ctrl_del_dev(struct ublk_device **p_ub, bool wait)
{
struct ublk_device *ub = *p_ub;
int idx = ub->ub_number;
@ -2508,7 +2502,7 @@ static int ublk_ctrl_del_dev(struct ublk_device **p_ub)
* - the device number is freed already, we will not find this
* device via ublk_get_device_from_id()
*/
if (wait_event_interruptible(ublk_idr_wq, ublk_idr_freed(idx)))
if (wait && wait_event_interruptible(ublk_idr_wq, ublk_idr_freed(idx)))
return -EINTR;
return 0;
}
@ -2907,7 +2901,10 @@ static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd,
ret = ublk_ctrl_add_dev(cmd);
break;
case UBLK_CMD_DEL_DEV:
ret = ublk_ctrl_del_dev(&ub);
ret = ublk_ctrl_del_dev(&ub, true);
break;
case UBLK_U_CMD_DEL_DEV_ASYNC:
ret = ublk_ctrl_del_dev(&ub, false);
break;
case UBLK_CMD_GET_QUEUE_AFFINITY:
ret = ublk_ctrl_get_queue_affinity(ub, cmd);

View File

@ -720,25 +720,24 @@ fail_report:
return ret;
}
static int virtblk_probe_zoned_device(struct virtio_device *vdev,
struct virtio_blk *vblk,
struct request_queue *q)
static int virtblk_read_zoned_limits(struct virtio_blk *vblk,
struct queue_limits *lim)
{
struct virtio_device *vdev = vblk->vdev;
u32 v, wg;
dev_dbg(&vdev->dev, "probing host-managed zoned device\n");
disk_set_zoned(vblk->disk);
blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, q);
lim->zoned = true;
virtio_cread(vdev, struct virtio_blk_config,
zoned.max_open_zones, &v);
disk_set_max_open_zones(vblk->disk, v);
lim->max_open_zones = v;
dev_dbg(&vdev->dev, "max open zones = %u\n", v);
virtio_cread(vdev, struct virtio_blk_config,
zoned.max_active_zones, &v);
disk_set_max_active_zones(vblk->disk, v);
lim->max_active_zones = v;
dev_dbg(&vdev->dev, "max active zones = %u\n", v);
virtio_cread(vdev, struct virtio_blk_config,
@ -747,8 +746,8 @@ static int virtblk_probe_zoned_device(struct virtio_device *vdev,
dev_warn(&vdev->dev, "zero write granularity reported\n");
return -ENODEV;
}
blk_queue_physical_block_size(q, wg);
blk_queue_io_min(q, wg);
lim->physical_block_size = wg;
lim->io_min = wg;
dev_dbg(&vdev->dev, "write granularity = %u\n", wg);
@ -764,13 +763,13 @@ static int virtblk_probe_zoned_device(struct virtio_device *vdev,
vblk->zone_sectors);
return -ENODEV;
}
blk_queue_chunk_sectors(q, vblk->zone_sectors);
lim->chunk_sectors = vblk->zone_sectors;
dev_dbg(&vdev->dev, "zone sectors = %u\n", vblk->zone_sectors);
if (virtio_has_feature(vdev, VIRTIO_BLK_F_DISCARD)) {
dev_warn(&vblk->vdev->dev,
"ignoring negotiated F_DISCARD for zoned device\n");
blk_queue_max_discard_sectors(q, 0);
lim->max_hw_discard_sectors = 0;
}
virtio_cread(vdev, struct virtio_blk_config,
@ -785,25 +784,21 @@ static int virtblk_probe_zoned_device(struct virtio_device *vdev,
wg, v);
return -ENODEV;
}
blk_queue_max_zone_append_sectors(q, v);
lim->max_zone_append_sectors = v;
dev_dbg(&vdev->dev, "max append sectors = %u\n", v);
return blk_revalidate_disk_zones(vblk->disk, NULL);
return 0;
}
#else
/*
* Zoned block device support is not configured in this kernel.
* Host-managed zoned devices can't be supported, but others are
* good to go as regular block devices.
* Zoned block device support is not configured in this kernel, host-managed
* zoned devices can't be supported.
*/
#define virtblk_report_zones NULL
static inline int virtblk_probe_zoned_device(struct virtio_device *vdev,
struct virtio_blk *vblk, struct request_queue *q)
static inline int virtblk_read_zoned_limits(struct virtio_blk *vblk,
struct queue_limits *lim)
{
dev_err(&vdev->dev,
dev_err(&vblk->vdev->dev,
"virtio_blk: zoned devices are not supported");
return -EOPNOTSUPP;
}
@ -1248,31 +1243,17 @@ static const struct blk_mq_ops virtio_mq_ops = {
static unsigned int virtblk_queue_depth;
module_param_named(queue_depth, virtblk_queue_depth, uint, 0444);
static int virtblk_probe(struct virtio_device *vdev)
static int virtblk_read_limits(struct virtio_blk *vblk,
struct queue_limits *lim)
{
struct virtio_blk *vblk;
struct request_queue *q;
int err, index;
struct virtio_device *vdev = vblk->vdev;
u32 v, blk_size, max_size, sg_elems, opt_io_size;
u32 max_discard_segs = 0;
u32 discard_granularity = 0;
u16 min_io_size;
u8 physical_block_exp, alignment_offset;
unsigned int queue_depth;
size_t max_dma_size;
if (!vdev->config->get) {
dev_err(&vdev->dev, "%s failure: config access disabled\n",
__func__);
return -EINVAL;
}
err = ida_alloc_range(&vd_index_ida, 0,
minor_to_index(1 << MINORBITS) - 1, GFP_KERNEL);
if (err < 0)
goto out;
index = err;
int err;
/* We need to know how many segments before we allocate. */
err = virtio_cread_feature(vdev, VIRTIO_BLK_F_SEG_MAX,
@ -1286,6 +1267,203 @@ static int virtblk_probe(struct virtio_device *vdev)
/* Prevent integer overflows and honor max vq size */
sg_elems = min_t(u32, sg_elems, VIRTIO_BLK_MAX_SG_ELEMS - 2);
/* We can handle whatever the host told us to handle. */
lim->max_segments = sg_elems;
/* No real sector limit. */
lim->max_hw_sectors = UINT_MAX;
max_dma_size = virtio_max_dma_size(vdev);
max_size = max_dma_size > U32_MAX ? U32_MAX : max_dma_size;
/* Host can optionally specify maximum segment size and number of
* segments. */
err = virtio_cread_feature(vdev, VIRTIO_BLK_F_SIZE_MAX,
struct virtio_blk_config, size_max, &v);
if (!err)
max_size = min(max_size, v);
lim->max_segment_size = max_size;
/* Host can optionally specify the block size of the device */
err = virtio_cread_feature(vdev, VIRTIO_BLK_F_BLK_SIZE,
struct virtio_blk_config, blk_size,
&blk_size);
if (!err) {
err = blk_validate_block_size(blk_size);
if (err) {
dev_err(&vdev->dev,
"virtio_blk: invalid block size: 0x%x\n",
blk_size);
return err;
}
lim->logical_block_size = blk_size;
} else
blk_size = lim->logical_block_size;
/* Use topology information if available */
err = virtio_cread_feature(vdev, VIRTIO_BLK_F_TOPOLOGY,
struct virtio_blk_config, physical_block_exp,
&physical_block_exp);
if (!err && physical_block_exp)
lim->physical_block_size = blk_size * (1 << physical_block_exp);
err = virtio_cread_feature(vdev, VIRTIO_BLK_F_TOPOLOGY,
struct virtio_blk_config, alignment_offset,
&alignment_offset);
if (!err && alignment_offset)
lim->alignment_offset = blk_size * alignment_offset;
err = virtio_cread_feature(vdev, VIRTIO_BLK_F_TOPOLOGY,
struct virtio_blk_config, min_io_size,
&min_io_size);
if (!err && min_io_size)
lim->io_min = blk_size * min_io_size;
err = virtio_cread_feature(vdev, VIRTIO_BLK_F_TOPOLOGY,
struct virtio_blk_config, opt_io_size,
&opt_io_size);
if (!err && opt_io_size)
lim->io_opt = blk_size * opt_io_size;
if (virtio_has_feature(vdev, VIRTIO_BLK_F_DISCARD)) {
virtio_cread(vdev, struct virtio_blk_config,
discard_sector_alignment, &discard_granularity);
virtio_cread(vdev, struct virtio_blk_config,
max_discard_sectors, &v);
lim->max_hw_discard_sectors = v ? v : UINT_MAX;
virtio_cread(vdev, struct virtio_blk_config, max_discard_seg,
&max_discard_segs);
}
if (virtio_has_feature(vdev, VIRTIO_BLK_F_WRITE_ZEROES)) {
virtio_cread(vdev, struct virtio_blk_config,
max_write_zeroes_sectors, &v);
lim->max_write_zeroes_sectors = v ? v : UINT_MAX;
}
/* The discard and secure erase limits are combined since the Linux
* block layer uses the same limit for both commands.
*
* If both VIRTIO_BLK_F_SECURE_ERASE and VIRTIO_BLK_F_DISCARD features
* are negotiated, we will use the minimum between the limits.
*
* discard sector alignment is set to the minimum between discard_sector_alignment
* and secure_erase_sector_alignment.
*
* max discard segments is set to the minimum between max_discard_seg and
* max_secure_erase_seg.
*/
if (virtio_has_feature(vdev, VIRTIO_BLK_F_SECURE_ERASE)) {
virtio_cread(vdev, struct virtio_blk_config,
secure_erase_sector_alignment, &v);
/* secure_erase_sector_alignment should not be zero, the device should set a
* valid number of sectors.
*/
if (!v) {
dev_err(&vdev->dev,
"virtio_blk: secure_erase_sector_alignment can't be 0\n");
return -EINVAL;
}
discard_granularity = min_not_zero(discard_granularity, v);
virtio_cread(vdev, struct virtio_blk_config,
max_secure_erase_sectors, &v);
/* max_secure_erase_sectors should not be zero, the device should set a
* valid number of sectors.
*/
if (!v) {
dev_err(&vdev->dev,
"virtio_blk: max_secure_erase_sectors can't be 0\n");
return -EINVAL;
}
lim->max_secure_erase_sectors = v;
virtio_cread(vdev, struct virtio_blk_config,
max_secure_erase_seg, &v);
/* max_secure_erase_seg should not be zero, the device should set a
* valid number of segments
*/
if (!v) {
dev_err(&vdev->dev,
"virtio_blk: max_secure_erase_seg can't be 0\n");
return -EINVAL;
}
max_discard_segs = min_not_zero(max_discard_segs, v);
}
if (virtio_has_feature(vdev, VIRTIO_BLK_F_DISCARD) ||
virtio_has_feature(vdev, VIRTIO_BLK_F_SECURE_ERASE)) {
/* max_discard_seg and discard_granularity will be 0 only
* if max_discard_seg and discard_sector_alignment fields in the virtio
* config are 0 and VIRTIO_BLK_F_SECURE_ERASE feature is not negotiated.
* In this case, we use default values.
*/
if (!max_discard_segs)
max_discard_segs = sg_elems;
lim->max_discard_segments =
min(max_discard_segs, MAX_DISCARD_SEGMENTS);
if (discard_granularity)
lim->discard_granularity =
discard_granularity << SECTOR_SHIFT;
else
lim->discard_granularity = blk_size;
}
if (virtio_has_feature(vdev, VIRTIO_BLK_F_ZONED)) {
u8 model;
virtio_cread(vdev, struct virtio_blk_config, zoned.model, &model);
switch (model) {
case VIRTIO_BLK_Z_NONE:
case VIRTIO_BLK_Z_HA:
/* treat host-aware devices as non-zoned */
return 0;
case VIRTIO_BLK_Z_HM:
err = virtblk_read_zoned_limits(vblk, lim);
if (err)
return err;
break;
default:
dev_err(&vdev->dev, "unsupported zone model %d\n", model);
return -EINVAL;
}
}
return 0;
}
static int virtblk_probe(struct virtio_device *vdev)
{
struct virtio_blk *vblk;
struct queue_limits lim = { };
int err, index;
unsigned int queue_depth;
if (!vdev->config->get) {
dev_err(&vdev->dev, "%s failure: config access disabled\n",
__func__);
return -EINVAL;
}
err = ida_alloc_range(&vd_index_ida, 0,
minor_to_index(1 << MINORBITS) - 1, GFP_KERNEL);
if (err < 0)
goto out;
index = err;
vdev->priv = vblk = kmalloc(sizeof(*vblk), GFP_KERNEL);
if (!vblk) {
err = -ENOMEM;
@ -1330,12 +1508,15 @@ static int virtblk_probe(struct virtio_device *vdev)
if (err)
goto out_free_vq;
vblk->disk = blk_mq_alloc_disk(&vblk->tag_set, vblk);
err = virtblk_read_limits(vblk, &lim);
if (err)
goto out_free_tags;
vblk->disk = blk_mq_alloc_disk(&vblk->tag_set, &lim, vblk);
if (IS_ERR(vblk->disk)) {
err = PTR_ERR(vblk->disk);
goto out_free_tags;
}
q = vblk->disk->queue;
virtblk_name_format("vd", index, vblk->disk->disk_name, DISK_NAME_LEN);
@ -1353,164 +1534,6 @@ static int virtblk_probe(struct virtio_device *vdev)
if (virtio_has_feature(vdev, VIRTIO_BLK_F_RO))
set_disk_ro(vblk->disk, 1);
/* We can handle whatever the host told us to handle. */
blk_queue_max_segments(q, sg_elems);
/* No real sector limit. */
blk_queue_max_hw_sectors(q, UINT_MAX);
max_dma_size = virtio_max_dma_size(vdev);
max_size = max_dma_size > U32_MAX ? U32_MAX : max_dma_size;
/* Host can optionally specify maximum segment size and number of
* segments. */
err = virtio_cread_feature(vdev, VIRTIO_BLK_F_SIZE_MAX,
struct virtio_blk_config, size_max, &v);
if (!err)
max_size = min(max_size, v);
blk_queue_max_segment_size(q, max_size);
/* Host can optionally specify the block size of the device */
err = virtio_cread_feature(vdev, VIRTIO_BLK_F_BLK_SIZE,
struct virtio_blk_config, blk_size,
&blk_size);
if (!err) {
err = blk_validate_block_size(blk_size);
if (err) {
dev_err(&vdev->dev,
"virtio_blk: invalid block size: 0x%x\n",
blk_size);
goto out_cleanup_disk;
}
blk_queue_logical_block_size(q, blk_size);
} else
blk_size = queue_logical_block_size(q);
/* Use topology information if available */
err = virtio_cread_feature(vdev, VIRTIO_BLK_F_TOPOLOGY,
struct virtio_blk_config, physical_block_exp,
&physical_block_exp);
if (!err && physical_block_exp)
blk_queue_physical_block_size(q,
blk_size * (1 << physical_block_exp));
err = virtio_cread_feature(vdev, VIRTIO_BLK_F_TOPOLOGY,
struct virtio_blk_config, alignment_offset,
&alignment_offset);
if (!err && alignment_offset)
blk_queue_alignment_offset(q, blk_size * alignment_offset);
err = virtio_cread_feature(vdev, VIRTIO_BLK_F_TOPOLOGY,
struct virtio_blk_config, min_io_size,
&min_io_size);
if (!err && min_io_size)
blk_queue_io_min(q, blk_size * min_io_size);
err = virtio_cread_feature(vdev, VIRTIO_BLK_F_TOPOLOGY,
struct virtio_blk_config, opt_io_size,
&opt_io_size);
if (!err && opt_io_size)
blk_queue_io_opt(q, blk_size * opt_io_size);
if (virtio_has_feature(vdev, VIRTIO_BLK_F_DISCARD)) {
virtio_cread(vdev, struct virtio_blk_config,
discard_sector_alignment, &discard_granularity);
virtio_cread(vdev, struct virtio_blk_config,
max_discard_sectors, &v);
blk_queue_max_discard_sectors(q, v ? v : UINT_MAX);
virtio_cread(vdev, struct virtio_blk_config, max_discard_seg,
&max_discard_segs);
}
if (virtio_has_feature(vdev, VIRTIO_BLK_F_WRITE_ZEROES)) {
virtio_cread(vdev, struct virtio_blk_config,
max_write_zeroes_sectors, &v);
blk_queue_max_write_zeroes_sectors(q, v ? v : UINT_MAX);
}
/* The discard and secure erase limits are combined since the Linux
* block layer uses the same limit for both commands.
*
* If both VIRTIO_BLK_F_SECURE_ERASE and VIRTIO_BLK_F_DISCARD features
* are negotiated, we will use the minimum between the limits.
*
* discard sector alignment is set to the minimum between discard_sector_alignment
* and secure_erase_sector_alignment.
*
* max discard sectors is set to the minimum between max_discard_seg and
* max_secure_erase_seg.
*/
if (virtio_has_feature(vdev, VIRTIO_BLK_F_SECURE_ERASE)) {
virtio_cread(vdev, struct virtio_blk_config,
secure_erase_sector_alignment, &v);
/* secure_erase_sector_alignment should not be zero, the device should set a
* valid number of sectors.
*/
if (!v) {
dev_err(&vdev->dev,
"virtio_blk: secure_erase_sector_alignment can't be 0\n");
err = -EINVAL;
goto out_cleanup_disk;
}
discard_granularity = min_not_zero(discard_granularity, v);
virtio_cread(vdev, struct virtio_blk_config,
max_secure_erase_sectors, &v);
/* max_secure_erase_sectors should not be zero, the device should set a
* valid number of sectors.
*/
if (!v) {
dev_err(&vdev->dev,
"virtio_blk: max_secure_erase_sectors can't be 0\n");
err = -EINVAL;
goto out_cleanup_disk;
}
blk_queue_max_secure_erase_sectors(q, v);
virtio_cread(vdev, struct virtio_blk_config,
max_secure_erase_seg, &v);
/* max_secure_erase_seg should not be zero, the device should set a
* valid number of segments
*/
if (!v) {
dev_err(&vdev->dev,
"virtio_blk: max_secure_erase_seg can't be 0\n");
err = -EINVAL;
goto out_cleanup_disk;
}
max_discard_segs = min_not_zero(max_discard_segs, v);
}
if (virtio_has_feature(vdev, VIRTIO_BLK_F_DISCARD) ||
virtio_has_feature(vdev, VIRTIO_BLK_F_SECURE_ERASE)) {
/* max_discard_seg and discard_granularity will be 0 only
* if max_discard_seg and discard_sector_alignment fields in the virtio
* config are 0 and VIRTIO_BLK_F_SECURE_ERASE feature is not negotiated.
* In this case, we use default values.
*/
if (!max_discard_segs)
max_discard_segs = sg_elems;
blk_queue_max_discard_segments(q,
min(max_discard_segs, MAX_DISCARD_SEGMENTS));
if (discard_granularity)
q->limits.discard_granularity = discard_granularity << SECTOR_SHIFT;
else
q->limits.discard_granularity = blk_size;
}
virtblk_update_capacity(vblk, false);
virtio_device_ready(vdev);
@ -1518,27 +1541,11 @@ static int virtblk_probe(struct virtio_device *vdev)
* All steps that follow use the VQs therefore they need to be
* placed after the virtio_device_ready() call above.
*/
if (virtio_has_feature(vdev, VIRTIO_BLK_F_ZONED)) {
u8 model;
virtio_cread(vdev, struct virtio_blk_config, zoned.model,
&model);
switch (model) {
case VIRTIO_BLK_Z_NONE:
case VIRTIO_BLK_Z_HA:
/* Present the host-aware device as non-zoned */
break;
case VIRTIO_BLK_Z_HM:
err = virtblk_probe_zoned_device(vdev, vblk, q);
if (err)
goto out_cleanup_disk;
break;
default:
dev_err(&vdev->dev, "unsupported zone model %d\n",
model);
err = -EINVAL;
if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) && lim.zoned) {
blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, vblk->disk->queue);
err = blk_revalidate_disk_zones(vblk->disk, NULL);
if (err)
goto out_cleanup_disk;
}
}
err = device_add_disk(&vdev->dev, vblk->disk, virtblk_attr_groups);

View File

@ -941,39 +941,35 @@ static const struct blk_mq_ops blkfront_mq_ops = {
.complete = blkif_complete_rq,
};
static void blkif_set_queue_limits(struct blkfront_info *info)
static void blkif_set_queue_limits(const struct blkfront_info *info,
struct queue_limits *lim)
{
struct request_queue *rq = info->rq;
struct gendisk *gd = info->gd;
unsigned int segments = info->max_indirect_segments ? :
BLKIF_MAX_SEGMENTS_PER_REQUEST;
blk_queue_flag_set(QUEUE_FLAG_VIRT, rq);
if (info->feature_discard) {
blk_queue_max_discard_sectors(rq, get_capacity(gd));
rq->limits.discard_granularity = info->discard_granularity ?:
info->physical_sector_size;
rq->limits.discard_alignment = info->discard_alignment;
lim->max_hw_discard_sectors = UINT_MAX;
if (info->discard_granularity)
lim->discard_granularity = info->discard_granularity;
lim->discard_alignment = info->discard_alignment;
if (info->feature_secdiscard)
blk_queue_max_secure_erase_sectors(rq,
get_capacity(gd));
lim->max_secure_erase_sectors = UINT_MAX;
}
/* Hard sector size and max sectors impersonate the equiv. hardware. */
blk_queue_logical_block_size(rq, info->sector_size);
blk_queue_physical_block_size(rq, info->physical_sector_size);
blk_queue_max_hw_sectors(rq, (segments * XEN_PAGE_SIZE) / 512);
lim->logical_block_size = info->sector_size;
lim->physical_block_size = info->physical_sector_size;
lim->max_hw_sectors = (segments * XEN_PAGE_SIZE) / 512;
/* Each segment in a request is up to an aligned page in size. */
blk_queue_segment_boundary(rq, PAGE_SIZE - 1);
blk_queue_max_segment_size(rq, PAGE_SIZE);
lim->seg_boundary_mask = PAGE_SIZE - 1;
lim->max_segment_size = PAGE_SIZE;
/* Ensure a merged request will fit in a single I/O ring slot. */
blk_queue_max_segments(rq, segments / GRANTS_PER_PSEG);
lim->max_segments = segments / GRANTS_PER_PSEG;
/* Make sure buffer addresses are sector-aligned. */
blk_queue_dma_alignment(rq, 511);
lim->dma_alignment = 511;
}
static const char *flush_info(struct blkfront_info *info)
@ -1070,6 +1066,7 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
struct blkfront_info *info, u16 sector_size,
unsigned int physical_sector_size)
{
struct queue_limits lim = {};
struct gendisk *gd;
int nr_minors = 1;
int err;
@ -1136,11 +1133,13 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
if (err)
goto out_release_minors;
gd = blk_mq_alloc_disk(&info->tag_set, info);
blkif_set_queue_limits(info, &lim);
gd = blk_mq_alloc_disk(&info->tag_set, &lim, info);
if (IS_ERR(gd)) {
err = PTR_ERR(gd);
goto out_free_tag_set;
}
blk_queue_flag_set(QUEUE_FLAG_VIRT, gd->queue);
strcpy(gd->disk_name, DEV_NAME);
ptr = encode_disk_name(gd->disk_name + sizeof(DEV_NAME) - 1, offset);
@ -1162,7 +1161,6 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
info->gd = gd;
info->sector_size = sector_size;
info->physical_sector_size = physical_sector_size;
blkif_set_queue_limits(info);
xlvbd_flush(info);
@ -2006,18 +2004,19 @@ static int blkfront_probe(struct xenbus_device *dev,
static int blkif_recover(struct blkfront_info *info)
{
struct queue_limits lim;
unsigned int r_index;
struct request *req, *n;
int rc;
struct bio *bio;
unsigned int segs;
struct blkfront_ring_info *rinfo;
lim = queue_limits_start_update(info->rq);
blkfront_gather_backend_features(info);
/* Reset limits changed by blk_mq_update_nr_hw_queues(). */
blkif_set_queue_limits(info);
segs = info->max_indirect_segments ? : BLKIF_MAX_SEGMENTS_PER_REQUEST;
blk_queue_max_segments(info->rq, segs / GRANTS_PER_PSEG);
blkif_set_queue_limits(info, &lim);
rc = queue_limits_commit_update(info->rq, &lim);
if (rc)
return rc;
for_each_rinfo(info, rinfo, r_index) {
rc = blkfront_setup_indirect(rinfo);
@ -2037,7 +2036,9 @@ static int blkif_recover(struct blkfront_info *info)
list_for_each_entry_safe(req, n, &info->requests, queuelist) {
/* Requeue pending requests (flush or discard) */
list_del_init(&req->queuelist);
BUG_ON(req->nr_phys_segments > segs);
BUG_ON(req->nr_phys_segments >
(info->max_indirect_segments ? :
BLKIF_MAX_SEGMENTS_PER_REQUEST));
blk_mq_requeue_request(req, false);
}
blk_mq_start_stopped_hw_queues(info->rq, true);

View File

@ -318,7 +318,7 @@ static int z2ram_register_disk(int minor)
struct gendisk *disk;
int err;
disk = blk_mq_alloc_disk(&tag_set, NULL);
disk = blk_mq_alloc_disk(&tag_set, NULL, NULL);
if (IS_ERR(disk))
return PTR_ERR(disk);

View File

@ -2177,6 +2177,28 @@ ATTRIBUTE_GROUPS(zram_disk);
*/
static int zram_add(void)
{
struct queue_limits lim = {
.logical_block_size = ZRAM_LOGICAL_BLOCK_SIZE,
/*
* To ensure that we always get PAGE_SIZE aligned and
* n*PAGE_SIZED sized I/O requests.
*/
.physical_block_size = PAGE_SIZE,
.io_min = PAGE_SIZE,
.io_opt = PAGE_SIZE,
.max_hw_discard_sectors = UINT_MAX,
/*
* zram_bio_discard() will clear all logical blocks if logical
* block size is identical with physical block size(PAGE_SIZE).
* But if it is different, we will skip discarding some parts of
* logical blocks in the part of the request range which isn't
* aligned to physical block size. So we can't ensure that all
* discarded logical blocks are zeroed.
*/
#if ZRAM_LOGICAL_BLOCK_SIZE == PAGE_SIZE
.max_write_zeroes_sectors = UINT_MAX,
#endif
};
struct zram *zram;
int ret, device_id;
@ -2195,11 +2217,11 @@ static int zram_add(void)
#endif
/* gendisk structure */
zram->disk = blk_alloc_disk(NUMA_NO_NODE);
if (!zram->disk) {
zram->disk = blk_alloc_disk(&lim, NUMA_NO_NODE);
if (IS_ERR(zram->disk)) {
pr_err("Error allocating disk structure for device %d\n",
device_id);
ret = -ENOMEM;
ret = PTR_ERR(zram->disk);
goto out_free_idr;
}
@ -2216,29 +2238,6 @@ static int zram_add(void)
/* zram devices sort of resembles non-rotational disks */
blk_queue_flag_set(QUEUE_FLAG_NONROT, zram->disk->queue);
blk_queue_flag_set(QUEUE_FLAG_SYNCHRONOUS, zram->disk->queue);
/*
* To ensure that we always get PAGE_SIZE aligned
* and n*PAGE_SIZED sized I/O requests.
*/
blk_queue_physical_block_size(zram->disk->queue, PAGE_SIZE);
blk_queue_logical_block_size(zram->disk->queue,
ZRAM_LOGICAL_BLOCK_SIZE);
blk_queue_io_min(zram->disk->queue, PAGE_SIZE);
blk_queue_io_opt(zram->disk->queue, PAGE_SIZE);
blk_queue_max_discard_sectors(zram->disk->queue, UINT_MAX);
/*
* zram_bio_discard() will clear all logical blocks if logical block
* size is identical with physical block size(PAGE_SIZE). But if it is
* different, we will skip discarding some parts of logical blocks in
* the part of the request range which isn't aligned to physical block
* size. So we can't ensure that all discarded logical blocks are
* zeroed.
*/
if (ZRAM_LOGICAL_BLOCK_SIZE == PAGE_SIZE)
blk_queue_max_write_zeroes_sectors(zram->disk->queue, UINT_MAX);
blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, zram->disk->queue);
ret = device_add_disk(NULL, zram->disk, zram_disk_groups);
if (ret)

View File

@ -724,11 +724,6 @@ static void probe_gdrom_setupdisk(void)
static int probe_gdrom_setupqueue(void)
{
blk_queue_logical_block_size(gd.gdrom_rq, GDROM_HARD_SECTOR);
/* using DMA so memory will need to be contiguous */
blk_queue_max_segments(gd.gdrom_rq, 1);
/* set a large max size to get most from DMA */
blk_queue_max_segment_size(gd.gdrom_rq, 0x40000);
gd.disk->queue = gd.gdrom_rq;
return gdrom_init_dma_mode();
}
@ -743,6 +738,13 @@ static const struct blk_mq_ops gdrom_mq_ops = {
*/
static int probe_gdrom(struct platform_device *devptr)
{
struct queue_limits lim = {
.logical_block_size = GDROM_HARD_SECTOR,
/* using DMA so memory will need to be contiguous */
.max_segments = 1,
/* set a large max size to get most from DMA */
.max_segment_size = 0x40000,
};
int err;
/*
@ -778,7 +780,7 @@ static int probe_gdrom(struct platform_device *devptr)
if (err)
goto probe_fail_free_cd_info;
gd.disk = blk_mq_alloc_disk(&gd.tag_set, NULL);
gd.disk = blk_mq_alloc_disk(&gd.tag_set, &lim, NULL);
if (IS_ERR(gd.disk)) {
err = PTR_ERR(gd.disk);
goto probe_fail_free_tag_set;
@ -829,7 +831,7 @@ probe_fail_no_mem:
return err;
}
static int remove_gdrom(struct platform_device *devptr)
static void remove_gdrom(struct platform_device *devptr)
{
blk_mq_free_tag_set(&gd.tag_set);
free_irq(HW_EVENT_GDROM_CMD, &gd);
@ -840,13 +842,11 @@ static int remove_gdrom(struct platform_device *devptr)
unregister_cdrom(gd.cd_info);
kfree(gd.cd_info);
kfree(gd.toc);
return 0;
}
static struct platform_driver gdrom_driver = {
.probe = probe_gdrom,
.remove = remove_gdrom,
.remove_new = remove_gdrom,
.driver = {
.name = GDROM_DEV_NAME,
},

View File

@ -900,9 +900,23 @@ static int bcache_device_init(struct bcache_device *d, unsigned int block_size,
struct request_queue *q;
const size_t max_stripes = min_t(size_t, INT_MAX,
SIZE_MAX / sizeof(atomic_t));
struct queue_limits lim = {
.max_hw_sectors = UINT_MAX,
.max_sectors = UINT_MAX,
.max_segment_size = UINT_MAX,
.max_segments = BIO_MAX_VECS,
.max_hw_discard_sectors = UINT_MAX,
.io_min = block_size,
.logical_block_size = block_size,
.physical_block_size = block_size,
};
uint64_t n;
int idx;
if (cached_bdev) {
d->stripe_size = bdev_io_opt(cached_bdev) >> SECTOR_SHIFT;
lim.io_opt = umax(block_size, bdev_io_opt(cached_bdev));
}
if (!d->stripe_size)
d->stripe_size = 1 << 31;
else if (d->stripe_size < BCH_MIN_STRIPE_SZ)
@ -935,8 +949,21 @@ static int bcache_device_init(struct bcache_device *d, unsigned int block_size,
BIOSET_NEED_BVECS|BIOSET_NEED_RESCUER))
goto out_ida_remove;
d->disk = blk_alloc_disk(NUMA_NO_NODE);
if (!d->disk)
if (lim.logical_block_size > PAGE_SIZE && cached_bdev) {
/*
* This should only happen with BCACHE_SB_VERSION_BDEV.
* Block/page size is checked for BCACHE_SB_VERSION_CDEV.
*/
pr_info("bcache%i: sb/logical block size (%u) greater than page size (%lu) falling back to device logical block size (%u)\n",
idx, lim.logical_block_size,
PAGE_SIZE, bdev_logical_block_size(cached_bdev));
/* This also adjusts physical block size/min io size if needed */
lim.logical_block_size = bdev_logical_block_size(cached_bdev);
}
d->disk = blk_alloc_disk(&lim, NUMA_NO_NODE);
if (IS_ERR(d->disk))
goto out_bioset_exit;
set_capacity(d->disk, sectors);
@ -949,27 +976,6 @@ static int bcache_device_init(struct bcache_device *d, unsigned int block_size,
d->disk->private_data = d;
q = d->disk->queue;
q->limits.max_hw_sectors = UINT_MAX;
q->limits.max_sectors = UINT_MAX;
q->limits.max_segment_size = UINT_MAX;
q->limits.max_segments = BIO_MAX_VECS;
blk_queue_max_discard_sectors(q, UINT_MAX);
q->limits.io_min = block_size;
q->limits.logical_block_size = block_size;
q->limits.physical_block_size = block_size;
if (q->limits.logical_block_size > PAGE_SIZE && cached_bdev) {
/*
* This should only happen with BCACHE_SB_VERSION_BDEV.
* Block/page size is checked for BCACHE_SB_VERSION_CDEV.
*/
pr_info("%s: sb/logical block size (%u) greater than page size (%lu) falling back to device logical block size (%u)\n",
d->disk->disk_name, q->limits.logical_block_size,
PAGE_SIZE, bdev_logical_block_size(cached_bdev));
/* This also adjusts physical block size/min io size if needed */
blk_queue_logical_block_size(q, bdev_logical_block_size(cached_bdev));
}
blk_queue_flag_set(QUEUE_FLAG_NONROT, d->disk->queue);
@ -1416,9 +1422,7 @@ static int cached_dev_init(struct cached_dev *dc, unsigned int block_size)
hlist_add_head(&io->hash, dc->io_hash + RECENT_IO);
}
dc->disk.stripe_size = q->limits.io_opt >> 9;
if (dc->disk.stripe_size)
if (bdev_io_opt(dc->bdev))
dc->partial_stripes_expensive =
q->limits.raid_partial_stripes_expensive;
@ -1428,9 +1432,6 @@ static int cached_dev_init(struct cached_dev *dc, unsigned int block_size)
if (ret)
return ret;
blk_queue_io_opt(dc->disk.disk->queue,
max(queue_io_opt(dc->disk.disk->queue), queue_io_opt(q)));
atomic_set(&dc->io_errors, 0);
dc->io_disable = false;
dc->error_limit = DEFAULT_CACHED_DEV_ERROR_LIMIT;

View File

@ -213,6 +213,7 @@ struct raid_dev {
#define RT_FLAG_RS_IN_SYNC 6
#define RT_FLAG_RS_RESYNCING 7
#define RT_FLAG_RS_GROW 8
#define RT_FLAG_RS_FROZEN 9
/* Array elements of 64 bit needed for rebuild/failed disk bits */
#define DISKS_ARRAY_ELEMS ((MAX_RAID_DEVICES + (sizeof(uint64_t) * 8 - 1)) / sizeof(uint64_t) / 8)
@ -3240,11 +3241,12 @@ size_check:
rs->md.ro = 1;
rs->md.in_sync = 1;
/* Keep array frozen until resume. */
set_bit(MD_RECOVERY_FROZEN, &rs->md.recovery);
/* Has to be held on running the array */
mddev_suspend_and_lock_nointr(&rs->md);
/* Keep array frozen until resume. */
md_frozen_sync_thread(&rs->md);
r = md_run(&rs->md);
rs->md.in_sync = 0; /* Assume already marked dirty */
if (r) {
@ -3339,7 +3341,8 @@ static int raid_map(struct dm_target *ti, struct bio *bio)
if (unlikely(bio_end_sector(bio) > mddev->array_sectors))
return DM_MAPIO_REQUEUE;
md_handle_request(mddev, bio);
if (unlikely(!md_handle_request(mddev, bio)))
return DM_MAPIO_REQUEUE;
return DM_MAPIO_SUBMITTED;
}
@ -3718,21 +3721,33 @@ static int raid_message(struct dm_target *ti, unsigned int argc, char **argv,
{
struct raid_set *rs = ti->private;
struct mddev *mddev = &rs->md;
int ret = 0;
if (!mddev->pers || !mddev->pers->sync_request)
return -EINVAL;
if (!strcasecmp(argv[0], "frozen"))
set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
else
clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
if (test_bit(RT_FLAG_RS_SUSPENDED, &rs->runtime_flags) ||
test_bit(RT_FLAG_RS_FROZEN, &rs->runtime_flags))
return -EBUSY;
if (!strcasecmp(argv[0], "idle") || !strcasecmp(argv[0], "frozen")) {
if (mddev->sync_thread) {
set_bit(MD_RECOVERY_INTR, &mddev->recovery);
md_reap_sync_thread(mddev);
}
} else if (decipher_sync_action(mddev, mddev->recovery) != st_idle)
if (!strcasecmp(argv[0], "frozen")) {
ret = mddev_lock(mddev);
if (ret)
return ret;
md_frozen_sync_thread(mddev);
mddev_unlock(mddev);
} else if (!strcasecmp(argv[0], "idle")) {
ret = mddev_lock(mddev);
if (ret)
return ret;
md_idle_sync_thread(mddev);
mddev_unlock(mddev);
}
clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
if (decipher_sync_action(mddev, mddev->recovery) != st_idle)
return -EBUSY;
else if (!strcasecmp(argv[0], "resync"))
; /* MD_RECOVERY_NEEDED set below */
@ -3791,15 +3806,46 @@ static void raid_io_hints(struct dm_target *ti, struct queue_limits *limits)
blk_limits_io_opt(limits, chunk_size_bytes * mddev_data_stripes(rs));
}
/*
 * Presuspend hook of the raid target.
 *
 * Sets RT_FLAG_RS_FROZEN first, so that raid_message() returns -EBUSY
 * instead of changing the sync_thread until the flag is cleared again;
 * waiting until raid_postsuspend() to do this would be too late.
 * Afterwards, if a reshape was interrupted, the personality is given the
 * chance to prepare for the suspend.
 */
static void raid_presuspend(struct dm_target *ti)
{
	struct raid_set *rs = ti->private;
	struct mddev *mddev = &rs->md;

	set_bit(RT_FLAG_RS_FROZEN, &rs->runtime_flags);

	/*
	 * For raid456, if reshape is interrupted, IO across reshape position
	 * will never make progress, while caller will wait for IO to be done.
	 * Inform raid456 to handle those IO to prevent deadlock.
	 */
	if (reshape_interrupted(mddev) && mddev->pers &&
	    mddev->pers->prepare_suspend)
		mddev->pers->prepare_suspend(mddev);
}
/*
 * Presuspend-undo hook of the raid target: clears RT_FLAG_RS_FROZEN in the
 * raid set's runtime flags.
 */
static void raid_presuspend_undo(struct dm_target *ti)
{
	struct raid_set *raid = ti->private;

	clear_bit(RT_FLAG_RS_FROZEN, &raid->runtime_flags);
}
static void raid_postsuspend(struct dm_target *ti)
{
struct raid_set *rs = ti->private;
if (!test_and_set_bit(RT_FLAG_RS_SUSPENDED, &rs->runtime_flags)) {
/* Writes have to be stopped before suspending to avoid deadlocks. */
if (!test_bit(MD_RECOVERY_FROZEN, &rs->md.recovery))
md_stop_writes(&rs->md);
/*
* sync_thread must be stopped during suspend, and writes have
* to be stopped before suspending to avoid deadlocks.
*/
md_stop_writes(&rs->md);
mddev_suspend(&rs->md, false);
}
}
@ -4012,8 +4058,6 @@ static int raid_preresume(struct dm_target *ti)
}
/* Check for any resize/reshape on @rs and adjust/initiate */
/* Be prepared for mddev_resume() in raid_resume() */
set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
if (mddev->recovery_cp && mddev->recovery_cp < MaxSector) {
set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
mddev->resync_min = mddev->recovery_cp;
@ -4047,7 +4091,9 @@ static void raid_resume(struct dm_target *ti)
* Take this opportunity to check whether any failed
* devices are reachable again.
*/
mddev_lock_nointr(mddev);
attempt_restore_of_faulty_devices(rs);
mddev_unlock(mddev);
}
if (test_and_clear_bit(RT_FLAG_RS_SUSPENDED, &rs->runtime_flags)) {
@ -4055,10 +4101,13 @@ static void raid_resume(struct dm_target *ti)
if (mddev->delta_disks < 0)
rs_set_capacity(rs);
WARN_ON_ONCE(!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery));
WARN_ON_ONCE(test_bit(MD_RECOVERY_RUNNING, &mddev->recovery));
clear_bit(RT_FLAG_RS_FROZEN, &rs->runtime_flags);
mddev_lock_nointr(mddev);
clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
mddev->ro = 0;
mddev->in_sync = 0;
md_unfrozen_sync_thread(mddev);
mddev_unlock_and_resume(mddev);
}
}
@ -4074,6 +4123,8 @@ static struct target_type raid_target = {
.message = raid_message,
.iterate_devices = raid_iterate_devices,
.io_hints = raid_io_hints,
.presuspend = raid_presuspend,
.presuspend_undo = raid_presuspend_undo,
.postsuspend = raid_postsuspend,
.preresume = raid_preresume,
.resume = raid_resume,

View File

@ -1963,26 +1963,27 @@ int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
bool wc = false, fua = false;
int r;
/*
* Copy table's limits to the DM device's request_queue
*/
q->limits = *limits;
if (dm_table_supports_nowait(t))
blk_queue_flag_set(QUEUE_FLAG_NOWAIT, q);
else
blk_queue_flag_clear(QUEUE_FLAG_NOWAIT, q);
if (!dm_table_supports_discards(t)) {
q->limits.max_discard_sectors = 0;
q->limits.max_hw_discard_sectors = 0;
q->limits.discard_granularity = 0;
q->limits.discard_alignment = 0;
q->limits.discard_misaligned = 0;
limits->max_hw_discard_sectors = 0;
limits->discard_granularity = 0;
limits->discard_alignment = 0;
limits->discard_misaligned = 0;
}
if (!dm_table_supports_write_zeroes(t))
limits->max_write_zeroes_sectors = 0;
if (!dm_table_supports_secure_erase(t))
q->limits.max_secure_erase_sectors = 0;
limits->max_secure_erase_sectors = 0;
r = queue_limits_set(q, limits);
if (r)
return r;
if (dm_table_supports_flush(t, (1UL << QUEUE_FLAG_WC))) {
wc = true;
@ -2007,9 +2008,6 @@ int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
else
blk_queue_flag_set(QUEUE_FLAG_NONROT, q);
if (!dm_table_supports_write_zeroes(t))
q->limits.max_write_zeroes_sectors = 0;
dm_table_verify_integrity(t);
/*
@ -2047,7 +2045,6 @@ int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
}
dm_update_crypto_profile(q, t);
disk_update_readahead(t->md->disk);
/*
* Check for request-based device is left to

View File

@ -1655,10 +1655,13 @@ static int dmz_reset_zone(struct dmz_metadata *zmd, struct dm_zone *zone)
if (!dmz_is_empty(zone) || dmz_seq_write_err(zone)) {
struct dmz_dev *dev = zone->dev;
unsigned int noio_flag;
noio_flag = memalloc_noio_save();
ret = blkdev_zone_mgmt(dev->bdev, REQ_OP_ZONE_RESET,
dmz_start_sect(zmd, zone),
zmd->zone_nr_sectors, GFP_NOIO);
zmd->zone_nr_sectors);
memalloc_noio_restore(noio_flag);
if (ret) {
dmz_dev_err(dev, "Reset zone %u failed %d",
zone->id, ret);

View File

@ -2101,8 +2101,8 @@ static struct mapped_device *alloc_dev(int minor)
* established. If request-based table is loaded: blk-mq will
* override accordingly.
*/
md->disk = blk_alloc_disk(md->numa_node_id);
if (!md->disk)
md->disk = blk_alloc_disk(NULL, md->numa_node_id);
if (IS_ERR(md->disk))
goto bad;
md->queue = md->disk->queue;

View File

@ -234,7 +234,8 @@ static int __write_sb_page(struct md_rdev *rdev, struct bitmap *bitmap,
sector_t doff;
bdev = (rdev->meta_bdev) ? rdev->meta_bdev : rdev->bdev;
if (pg_index == store->file_pages - 1) {
/* we compare length (page numbers), not page offset. */
if ((pg_index - store->sb_index) == store->file_pages - 1) {
unsigned int last_page_size = store->bytes & (PAGE_SIZE - 1);
if (last_page_size == 0)
@ -438,8 +439,8 @@ static void filemap_write_page(struct bitmap *bitmap, unsigned long pg_index,
struct page *page = store->filemap[pg_index];
if (mddev_is_clustered(bitmap->mddev)) {
pg_index += bitmap->cluster_slot *
DIV_ROUND_UP(store->bytes, PAGE_SIZE);
/* go to node bitmap area starting point */
pg_index += store->sb_index;
}
if (store->file)
@ -952,6 +953,7 @@ static void md_bitmap_file_set_bit(struct bitmap *bitmap, sector_t block)
unsigned long index = file_page_index(store, chunk);
unsigned long node_offset = 0;
index += store->sb_index;
if (mddev_is_clustered(bitmap->mddev))
node_offset = bitmap->cluster_slot * store->file_pages;
@ -982,6 +984,7 @@ static void md_bitmap_file_clear_bit(struct bitmap *bitmap, sector_t block)
unsigned long index = file_page_index(store, chunk);
unsigned long node_offset = 0;
index += store->sb_index;
if (mddev_is_clustered(bitmap->mddev))
node_offset = bitmap->cluster_slot * store->file_pages;
@ -1043,9 +1046,8 @@ void md_bitmap_unplug(struct bitmap *bitmap)
if (dirty || need_write) {
if (!writing) {
md_bitmap_wait_writes(bitmap);
if (bitmap->mddev->queue)
blk_add_trace_msg(bitmap->mddev->queue,
"md bitmap_unplug");
mddev_add_trace_msg(bitmap->mddev,
"md bitmap_unplug");
}
clear_page_attr(bitmap, i, BITMAP_PAGE_PENDING);
filemap_write_page(bitmap, i, false);
@ -1316,9 +1318,7 @@ void md_bitmap_daemon_work(struct mddev *mddev)
}
bitmap->allclean = 1;
if (bitmap->mddev->queue)
blk_add_trace_msg(bitmap->mddev->queue,
"md bitmap_daemon_work");
mddev_add_trace_msg(bitmap->mddev, "md bitmap_daemon_work");
/* Any file-page which is PENDING now needs to be written.
* So set NEEDWRITE now, then after we make any last-minute changes

View File

@ -1,17 +0,0 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINEAR_H
#define _LINEAR_H
/* Element type of the disks[] array in struct linear_conf. */
struct dev_info {
struct md_rdev *rdev;
/* NOTE(review): presumably the sector at which this member's range ends - confirm against users */
sector_t end_sector;
};
/*
 * Fixed header followed by a flexible array of struct dev_info entries.
 * NOTE(review): presumably the private configuration of the md linear
 * personality - confirm against users.
 */
struct linear_conf
{
/* NOTE(review): presumably used for RCU-deferred freeing - confirm */
struct rcu_head rcu;
sector_t array_sectors;
int raid_disks; /* a copy of mddev->raid_disks */
/* flexible array member; __counted_by() declares raid_disks as its element count */
struct dev_info disks[] __counted_by(raid_disks);
};
#endif

View File

@ -1,32 +0,0 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _MULTIPATH_H
#define _MULTIPATH_H
/* Element type pointed to by mpconf->multipaths; wraps a single md_rdev pointer. */
struct multipath_info {
struct md_rdev *rdev;
};
/*
 * NOTE(review): presumably the private configuration of the md multipath
 * personality - confirm against users.
 */
struct mpconf {
struct mddev *mddev;
/* NOTE(review): presumably an array with raid_disks entries - confirm */
struct multipath_info *multipaths;
int raid_disks;
/* NOTE(review): presumably protects retry_list - confirm */
spinlock_t device_lock;
struct list_head retry_list;
mempool_t pool;
};
/*
* This is our 'private' 'collective' MULTIPATH buffer head.
* It contains information about what kind of IO operations were started
* for this MULTIPATH operation, and about their status:
*/
struct multipath_bh {
struct mddev *mddev;
/* NOTE(review): presumably the bio originally submitted to the array - confirm */
struct bio *master_bio;
/* embedded by value, not a pointer */
struct bio bio;
/* NOTE(review): presumably an index into mpconf->multipaths - confirm */
int path;
struct list_head retry_list;
};
#endif

View File

@ -65,7 +65,6 @@
#include <linux/percpu-refcount.h>
#include <linux/part_stat.h>
#include <trace/events/block.h>
#include "md.h"
#include "md-bitmap.h"
#include "md-cluster.h"
@ -99,18 +98,6 @@ static void mddev_detach(struct mddev *mddev);
static void export_rdev(struct md_rdev *rdev, struct mddev *mddev);
static void md_wakeup_thread_directly(struct md_thread __rcu *thread);
/* Values held in mddev->ro. */
enum md_ro_state {
MD_RDWR, /* the state md_is_rdwr() tests for */
MD_RDONLY,
MD_AUTO_READ, /* NOTE(review): presumably read-only until the first write arrives - confirm */
MD_MAX_STATE /* NOTE(review): looks like an end marker rather than a real state - confirm */
};
/* True when mddev->ro is MD_RDWR, false for every other md_ro_state. */
static bool md_is_rdwr(struct mddev *mddev)
{
	return mddev->ro == MD_RDWR;
}
/*
* Default number of read corrections we'll attempt on an rdev
* before ejecting it from the array. We divide the read error
@ -378,7 +365,7 @@ static bool is_suspended(struct mddev *mddev, struct bio *bio)
return true;
}
void md_handle_request(struct mddev *mddev, struct bio *bio)
bool md_handle_request(struct mddev *mddev, struct bio *bio)
{
check_suspended:
if (is_suspended(mddev, bio)) {
@ -386,7 +373,7 @@ check_suspended:
/* Bail out if REQ_NOWAIT is set for the bio */
if (bio->bi_opf & REQ_NOWAIT) {
bio_wouldblock_error(bio);
return;
return true;
}
for (;;) {
prepare_to_wait(&mddev->sb_wait, &__wait,
@ -402,10 +389,13 @@ check_suspended:
if (!mddev->pers->make_request(mddev, bio)) {
percpu_ref_put(&mddev->active_io);
if (!mddev->gendisk && mddev->pers->prepare_suspend)
return false;
goto check_suspended;
}
percpu_ref_put(&mddev->active_io);
return true;
}
EXPORT_SYMBOL(md_handle_request);
@ -529,6 +519,24 @@ void mddev_resume(struct mddev *mddev)
}
EXPORT_SYMBOL_GPL(mddev_resume);
/*
 * Mark the array as closing and flush its block device; used before the
 * array is set read-only or stopped.
 *
 * @opener_num: how many openers are tolerated.  If the array has a
 *              personality and more than this many openers, the call fails.
 *
 * Returns 0 on success, with MD_CLOSING left set in mddev->flags, or -EBUSY
 * if there are too many openers or MD_CLOSING was already set.
 */
static int mddev_set_closing_and_sync_blockdev(struct mddev *mddev, int opener_num)
{
	bool busy;

	mutex_lock(&mddev->open_mutex);
	/* '||' short-circuits: MD_CLOSING is only set if the opener check passes. */
	busy = (mddev->pers && atomic_read(&mddev->openers) > opener_num) ||
	       test_and_set_bit(MD_CLOSING, &mddev->flags);
	mutex_unlock(&mddev->open_mutex);

	if (busy)
		return -EBUSY;

	sync_blockdev(mddev->gendisk->part0);
	return 0;
}
/*
* Generic flush handling for md
*/
@ -2406,7 +2414,7 @@ int md_integrity_register(struct mddev *mddev)
if (list_empty(&mddev->disks))
return 0; /* nothing to do */
if (!mddev->gendisk || blk_get_integrity(mddev->gendisk))
if (mddev_is_dm(mddev) || blk_get_integrity(mddev->gendisk))
return 0; /* shouldn't register, or already is */
rdev_for_each(rdev, mddev) {
/* skip spares and non-functional disks */
@ -2459,7 +2467,7 @@ int md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev)
{
struct blk_integrity *bi_mddev;
if (!mddev->gendisk)
if (mddev_is_dm(mddev))
return 0;
bi_mddev = blk_get_integrity(mddev->gendisk);
@ -2566,6 +2574,7 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev)
fail:
pr_warn("md: failed to register dev-%s for %s\n",
b, mdname(mddev));
mddev_destroy_serial_pool(mddev, rdev);
return err;
}
@ -2595,7 +2604,7 @@ static void md_kick_rdev_from_array(struct md_rdev *rdev)
list_del_rcu(&rdev->same_set);
pr_debug("md: unbind<%pg>\n", rdev->bdev);
mddev_destroy_serial_pool(rdev->mddev, rdev);
rdev->mddev = NULL;
WRITE_ONCE(rdev->mddev, NULL);
sysfs_remove_link(&rdev->kobj, "block");
sysfs_put(rdev->sysfs_state);
sysfs_put(rdev->sysfs_unack_badblocks);
@ -2851,8 +2860,7 @@ repeat:
pr_debug("md: updating %s RAID superblock on device (in sync %d)\n",
mdname(mddev), mddev->in_sync);
if (mddev->queue)
blk_add_trace_msg(mddev->queue, "md md_update_sb");
mddev_add_trace_msg(mddev, "md md_update_sb");
rewrite:
md_bitmap_update_sb(mddev->bitmap);
rdev_for_each(rdev, mddev) {
@ -2933,7 +2941,6 @@ static int add_bound_rdev(struct md_rdev *rdev)
set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
md_new_event();
md_wakeup_thread(mddev->thread);
return 0;
}
@ -3048,10 +3055,8 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len)
if (err == 0) {
md_kick_rdev_from_array(rdev);
if (mddev->pers) {
if (mddev->pers)
set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
md_wakeup_thread(mddev->thread);
}
md_new_event();
}
}
@ -3081,7 +3086,6 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len)
clear_bit(BlockedBadBlocks, &rdev->flags);
wake_up(&rdev->blocked_wait);
set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
md_wakeup_thread(rdev->mddev->thread);
err = 0;
} else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) {
@ -3119,7 +3123,6 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len)
!test_bit(Replacement, &rdev->flags))
set_bit(WantReplacement, &rdev->flags);
set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
md_wakeup_thread(rdev->mddev->thread);
err = 0;
} else if (cmd_match(buf, "-want_replacement")) {
/* Clearing 'want_replacement' is always allowed.
@ -3249,7 +3252,6 @@ slot_store(struct md_rdev *rdev, const char *buf, size_t len)
if (rdev->raid_disk >= 0)
return -EBUSY;
set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
md_wakeup_thread(rdev->mddev->thread);
} else if (rdev->mddev->pers) {
/* Activating a spare .. or possibly reactivating
* if we ever get bitmaps working here.
@ -3343,8 +3345,7 @@ static ssize_t new_offset_store(struct md_rdev *rdev,
if (kstrtoull(buf, 10, &new_offset) < 0)
return -EINVAL;
if (mddev->sync_thread ||
test_bit(MD_RECOVERY_RUNNING,&mddev->recovery))
if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
return -EBUSY;
if (new_offset == rdev->data_offset)
/* reset is always permitted */
@ -3675,7 +3676,7 @@ rdev_attr_store(struct kobject *kobj, struct attribute *attr,
struct kernfs_node *kn = NULL;
bool suspend = false;
ssize_t rv;
struct mddev *mddev = rdev->mddev;
struct mddev *mddev = READ_ONCE(rdev->mddev);
if (!entry->store)
return -EIO;
@ -4017,8 +4018,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
*/
rv = -EBUSY;
if (mddev->sync_thread ||
test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
mddev->reshape_position != MaxSector ||
mddev->sysfs_active)
goto out_unlock;
@ -4168,7 +4168,6 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
mddev->in_sync = 1;
del_timer_sync(&mddev->safemode_timer);
}
blk_set_stacking_limits(&mddev->queue->limits);
pers->run(mddev);
set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
if (!mddev->thread)
@ -4475,8 +4474,8 @@ array_state_show(struct mddev *mddev, char *page)
return sprintf(page, "%s\n", array_states[st]);
}
static int do_md_stop(struct mddev *mddev, int ro, struct block_device *bdev);
static int md_set_readonly(struct mddev *mddev, struct block_device *bdev);
static int do_md_stop(struct mddev *mddev, int ro);
static int md_set_readonly(struct mddev *mddev);
static int restart_array(struct mddev *mddev);
static ssize_t
@ -4493,6 +4492,17 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len)
case broken: /* cannot be set */
case bad_word:
return -EINVAL;
case clear:
case readonly:
case inactive:
case read_auto:
if (!mddev->pers || !md_is_rdwr(mddev))
break;
/* writing to sysfs does not open mddev, so the opener count should be 0 */
err = mddev_set_closing_and_sync_blockdev(mddev, 0);
if (err)
return err;
break;
default:
break;
}
@ -4526,14 +4536,14 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len)
case inactive:
/* stop an active array, return 0 otherwise */
if (mddev->pers)
err = do_md_stop(mddev, 2, NULL);
err = do_md_stop(mddev, 2);
break;
case clear:
err = do_md_stop(mddev, 0, NULL);
err = do_md_stop(mddev, 0);
break;
case readonly:
if (mddev->pers)
err = md_set_readonly(mddev, NULL);
err = md_set_readonly(mddev);
else {
mddev->ro = MD_RDONLY;
set_disk_ro(mddev->gendisk, 1);
@ -4543,7 +4553,7 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len)
case read_auto:
if (mddev->pers) {
if (md_is_rdwr(mddev))
err = md_set_readonly(mddev, NULL);
err = md_set_readonly(mddev);
else if (mddev->ro == MD_RDONLY)
err = restart_array(mddev);
if (err == 0) {
@ -4592,6 +4602,11 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len)
sysfs_notify_dirent_safe(mddev->sysfs_state);
}
mddev_unlock(mddev);
if (st == readonly || st == read_auto || st == inactive ||
(err && st == clear))
clear_bit(MD_CLOSING, &mddev->flags);
return err ?: len;
}
static struct md_sysfs_entry md_array_state =
@ -4919,6 +4934,35 @@ static void stop_sync_thread(struct mddev *mddev, bool locked, bool check_seq)
mddev_lock_nointr(mddev);
}
/*
 * Clear MD_RECOVERY_FROZEN and then call stop_sync_thread() with
 * locked=true and check_seq=true.
 *
 * The caller must hold mddev->reconfig_mutex (asserted via lockdep).
 */
void md_idle_sync_thread(struct mddev *mddev)
{
lockdep_assert_held(&mddev->reconfig_mutex);
clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
stop_sync_thread(mddev, true, true);
}
EXPORT_SYMBOL_GPL(md_idle_sync_thread);
/*
 * Set MD_RECOVERY_FROZEN and then call stop_sync_thread() with
 * locked=true and check_seq=false.
 *
 * The caller must hold mddev->reconfig_mutex (asserted via lockdep).
 */
void md_frozen_sync_thread(struct mddev *mddev)
{
lockdep_assert_held(&mddev->reconfig_mutex);
set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
stop_sync_thread(mddev, true, false);
}
EXPORT_SYMBOL_GPL(md_frozen_sync_thread);
/*
 * Undo a freeze: clear MD_RECOVERY_FROZEN, flag MD_RECOVERY_NEEDED, wake
 * mddev->thread and notify the sysfs_action dirent.
 *
 * The caller must hold mddev->reconfig_mutex (asserted via lockdep).
 */
void md_unfrozen_sync_thread(struct mddev *mddev)
{
lockdep_assert_held(&mddev->reconfig_mutex);
clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
md_wakeup_thread(mddev->thread);
sysfs_notify_dirent_safe(mddev->sysfs_action);
}
EXPORT_SYMBOL_GPL(md_unfrozen_sync_thread);
static void idle_sync_thread(struct mddev *mddev)
{
mutex_lock(&mddev->sync_mutex);
@ -5710,6 +5754,51 @@ static const struct kobj_type md_ktype = {
int mdp_major = 0;
/* stack the limit for all rdevs into lim */
void mddev_stack_rdev_limits(struct mddev *mddev, struct queue_limits *lim)
{
struct md_rdev *rdev;
rdev_for_each(rdev, mddev) {
queue_limits_stack_bdev(lim, rdev->bdev, rdev->data_offset,
mddev->gendisk->disk_name);
}
}
EXPORT_SYMBOL_GPL(mddev_stack_rdev_limits);
/*
 * Apply the extra stacking limits of a newly added @rdev to the array's
 * request queue.
 *
 * Returns 0 without doing anything when mddev_is_dm() is true; otherwise
 * returns the result of queue_limits_commit_update().
 */
int mddev_stack_new_rdev(struct mddev *mddev, struct md_rdev *rdev)
{
	struct request_queue *q;
	struct queue_limits lim;

	if (mddev_is_dm(mddev))
		return 0;

	/* Only dereference ->gendisk after the dm check above. */
	q = mddev->gendisk->queue;

	lim = queue_limits_start_update(q);
	queue_limits_stack_bdev(&lim, rdev->bdev, rdev->data_offset,
				mddev->gendisk->disk_name);
	return queue_limits_commit_update(q, &lim);
}
EXPORT_SYMBOL_GPL(mddev_stack_new_rdev);
/*
 * Update the optimal I/O size after a reshape: io_opt becomes
 * io_min * @nr_stripes.
 *
 * Does nothing when mddev_is_dm() is true, or when the array cannot be
 * suspended.  The array is suspended around the limits update and resumed
 * afterwards.
 */
void mddev_update_io_opt(struct mddev *mddev, unsigned int nr_stripes)
{
	struct request_queue *q;
	struct queue_limits lim;

	/* '||' short-circuits, so a dm-backed array is never suspended here. */
	if (mddev_is_dm(mddev) || mddev_suspend(mddev, false) < 0)
		return;

	q = mddev->gendisk->queue;

	lim = queue_limits_start_update(q);
	lim.io_opt = lim.io_min * nr_stripes;
	queue_limits_commit_update(q, &lim);

	mddev_resume(mddev);
}
EXPORT_SYMBOL_GPL(mddev_update_io_opt);
static void mddev_delayed_delete(struct work_struct *ws)
{
struct mddev *mddev = container_of(ws, struct mddev, del_work);
@ -5774,10 +5863,11 @@ struct mddev *md_alloc(dev_t dev, char *name)
*/
mddev->hold_active = UNTIL_STOP;
error = -ENOMEM;
disk = blk_alloc_disk(NUMA_NO_NODE);
if (!disk)
disk = blk_alloc_disk(NULL, NUMA_NO_NODE);
if (IS_ERR(disk)) {
error = PTR_ERR(disk);
goto out_free_mddev;
}
disk->major = MAJOR(mddev->unit);
disk->first_minor = unit << shift;
@ -5791,9 +5881,7 @@ struct mddev *md_alloc(dev_t dev, char *name)
disk->fops = &md_fops;
disk->private_data = mddev;
mddev->queue = disk->queue;
blk_set_stacking_limits(&mddev->queue->limits);
blk_queue_write_cache(mddev->queue, true, true);
blk_queue_write_cache(disk->queue, true, true);
disk->events |= DISK_EVENT_MEDIA_CHANGE;
mddev->gendisk = disk;
error = add_disk(disk);
@ -5935,7 +6023,7 @@ int md_run(struct mddev *mddev)
invalidate_bdev(rdev->bdev);
if (mddev->ro != MD_RDONLY && rdev_read_only(rdev)) {
mddev->ro = MD_RDONLY;
if (mddev->gendisk)
if (!mddev_is_dm(mddev))
set_disk_ro(mddev->gendisk, 1);
}
@ -6038,7 +6126,10 @@ int md_run(struct mddev *mddev)
pr_warn("True protection against single-disk failure might be compromised.\n");
}
mddev->recovery = 0;
/* dm-raid expects sync_thread to be frozen until resume */
if (mddev->gendisk)
mddev->recovery = 0;
/* may be over-ridden by personality */
mddev->resync_max_sectors = mddev->dev_sectors;
@ -6094,7 +6185,8 @@ int md_run(struct mddev *mddev)
}
}
if (mddev->queue) {
if (!mddev_is_dm(mddev)) {
struct request_queue *q = mddev->gendisk->queue;
bool nonrot = true;
rdev_for_each(rdev, mddev) {
@ -6106,14 +6198,14 @@ int md_run(struct mddev *mddev)
if (mddev->degraded)
nonrot = false;
if (nonrot)
blk_queue_flag_set(QUEUE_FLAG_NONROT, mddev->queue);
blk_queue_flag_set(QUEUE_FLAG_NONROT, q);
else
blk_queue_flag_clear(QUEUE_FLAG_NONROT, mddev->queue);
blk_queue_flag_set(QUEUE_FLAG_IO_STAT, mddev->queue);
blk_queue_flag_clear(QUEUE_FLAG_NONROT, q);
blk_queue_flag_set(QUEUE_FLAG_IO_STAT, q);
/* Set the NOWAIT flags if all underlying devices support it */
if (nowait)
blk_queue_flag_set(QUEUE_FLAG_NOWAIT, mddev->queue);
blk_queue_flag_set(QUEUE_FLAG_NOWAIT, q);
}
if (pers->sync_request) {
if (mddev->kobj.sd &&
@ -6192,7 +6284,6 @@ int do_md_run(struct mddev *mddev)
/* run start up tasks that require md_thread */
md_start(mddev);
md_wakeup_thread(mddev->thread);
md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
set_capacity_and_notify(mddev->gendisk, mddev->array_sectors);
@ -6213,7 +6304,6 @@ int md_start(struct mddev *mddev)
if (mddev->pers->start) {
set_bit(MD_RECOVERY_WAIT, &mddev->recovery);
md_wakeup_thread(mddev->thread);
ret = mddev->pers->start(mddev);
clear_bit(MD_RECOVERY_WAIT, &mddev->recovery);
md_wakeup_thread(mddev->sync_thread);
@ -6258,7 +6348,6 @@ static int restart_array(struct mddev *mddev)
pr_debug("md: %s switched to read-write mode.\n", mdname(mddev));
/* Kick recovery or resync if necessary */
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
md_wakeup_thread(mddev->thread);
md_wakeup_thread(mddev->sync_thread);
sysfs_notify_dirent_safe(mddev->sysfs_state);
return 0;
@ -6278,7 +6367,15 @@ static void md_clean(struct mddev *mddev)
mddev->persistent = 0;
mddev->level = LEVEL_NONE;
mddev->clevel[0] = 0;
mddev->flags = 0;
/*
* Don't clear MD_CLOSING, or mddev can be opened again.
* 'hold_active != 0' means mddev is still in the creation
* process and will be used later.
*/
if (mddev->hold_active)
mddev->flags = 0;
else
mddev->flags &= BIT_ULL_MASK(MD_CLOSING);
mddev->sb_flags = 0;
mddev->ro = MD_RDWR;
mddev->metadata_type[0] = 0;
@ -6315,7 +6412,6 @@ static void md_clean(struct mddev *mddev)
static void __md_stop_writes(struct mddev *mddev)
{
stop_sync_thread(mddev, true, false);
del_timer_sync(&mddev->safemode_timer);
if (mddev->pers && mddev->pers->quiesce) {
@ -6340,6 +6436,8 @@ static void __md_stop_writes(struct mddev *mddev)
void md_stop_writes(struct mddev *mddev)
{
mddev_lock_nointr(mddev);
set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
stop_sync_thread(mddev, true, false);
__md_stop_writes(mddev);
mddev_unlock(mddev);
}
@ -6353,8 +6451,10 @@ static void mddev_detach(struct mddev *mddev)
mddev->pers->quiesce(mddev, 0);
}
md_unregister_thread(mddev, &mddev->thread);
if (mddev->queue)
blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
/* the unplug fn references 'conf' */
if (!mddev_is_dm(mddev))
blk_sync_queue(mddev->gendisk->queue);
}
static void __md_stop(struct mddev *mddev)
@ -6391,7 +6491,8 @@ void md_stop(struct mddev *mddev)
EXPORT_SYMBOL_GPL(md_stop);
static int md_set_readonly(struct mddev *mddev, struct block_device *bdev)
/* ensure 'mddev->pers' exists before calling md_set_readonly() */
static int md_set_readonly(struct mddev *mddev)
{
int err = 0;
int did_freeze = 0;
@ -6402,7 +6503,6 @@ static int md_set_readonly(struct mddev *mddev, struct block_device *bdev)
if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) {
did_freeze = 1;
set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
md_wakeup_thread(mddev->thread);
}
stop_sync_thread(mddev, false, false);
@ -6410,36 +6510,29 @@ static int md_set_readonly(struct mddev *mddev, struct block_device *bdev)
!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
mddev_lock_nointr(mddev);
mutex_lock(&mddev->open_mutex);
if ((mddev->pers && atomic_read(&mddev->openers) > !!bdev) ||
mddev->sync_thread ||
test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) {
if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) {
pr_warn("md: %s still in use.\n",mdname(mddev));
err = -EBUSY;
goto out;
}
if (mddev->pers) {
__md_stop_writes(mddev);
__md_stop_writes(mddev);
if (mddev->ro == MD_RDONLY) {
err = -ENXIO;
goto out;
}
mddev->ro = MD_RDONLY;
set_disk_ro(mddev->gendisk, 1);
if (mddev->ro == MD_RDONLY) {
err = -ENXIO;
goto out;
}
mddev->ro = MD_RDONLY;
set_disk_ro(mddev->gendisk, 1);
out:
if ((mddev->pers && !err) || did_freeze) {
if (!err || did_freeze) {
clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
md_wakeup_thread(mddev->thread);
sysfs_notify_dirent_safe(mddev->sysfs_state);
}
mutex_unlock(&mddev->open_mutex);
return err;
}
@ -6447,8 +6540,7 @@ out:
* 0 - completely stop and dis-assemble array
* 2 - stop but do not disassemble array
*/
static int do_md_stop(struct mddev *mddev, int mode,
struct block_device *bdev)
static int do_md_stop(struct mddev *mddev, int mode)
{
struct gendisk *disk = mddev->gendisk;
struct md_rdev *rdev;
@ -6457,22 +6549,16 @@ static int do_md_stop(struct mddev *mddev, int mode,
if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) {
did_freeze = 1;
set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
md_wakeup_thread(mddev->thread);
}
stop_sync_thread(mddev, true, false);
mutex_lock(&mddev->open_mutex);
if ((mddev->pers && atomic_read(&mddev->openers) > !!bdev) ||
mddev->sysfs_active ||
mddev->sync_thread ||
if (mddev->sysfs_active ||
test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) {
pr_warn("md: %s still in use.\n",mdname(mddev));
mutex_unlock(&mddev->open_mutex);
if (did_freeze) {
clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
md_wakeup_thread(mddev->thread);
}
return -EBUSY;
}
@ -6491,13 +6577,11 @@ static int do_md_stop(struct mddev *mddev, int mode,
sysfs_unlink_rdev(mddev, rdev);
set_capacity_and_notify(disk, 0);
mutex_unlock(&mddev->open_mutex);
mddev->changed = 1;
if (!md_is_rdwr(mddev))
mddev->ro = MD_RDWR;
} else
mutex_unlock(&mddev->open_mutex);
}
/*
* Free resources if final stop
*/
@ -6543,7 +6627,7 @@ static void autorun_array(struct mddev *mddev)
err = do_md_run(mddev);
if (err) {
pr_warn("md: do_md_run() returned %d\n", err);
do_md_stop(mddev, 0, NULL);
do_md_stop(mddev, 0);
}
}
@ -7013,9 +7097,7 @@ kick_rdev:
md_kick_rdev_from_array(rdev);
set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
if (mddev->thread)
md_wakeup_thread(mddev->thread);
else
if (!mddev->thread)
md_update_sb(mddev, 1);
md_new_event();
@ -7090,14 +7172,13 @@ static int hot_add_disk(struct mddev *mddev, dev_t dev)
if (!bdev_nowait(rdev->bdev)) {
pr_info("%s: Disabling nowait because %pg does not support nowait\n",
mdname(mddev), rdev->bdev);
blk_queue_flag_clear(QUEUE_FLAG_NOWAIT, mddev->queue);
blk_queue_flag_clear(QUEUE_FLAG_NOWAIT, mddev->gendisk->queue);
}
/*
* Kick recovery, maybe this spare has to be added to the
* array immediately.
*/
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
md_wakeup_thread(mddev->thread);
md_new_event();
return 0;
@ -7311,8 +7392,7 @@ static int update_size(struct mddev *mddev, sector_t num_sectors)
* of each device. If num_sectors is zero, we find the largest size
* that fits.
*/
if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
mddev->sync_thread)
if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
return -EBUSY;
if (!md_is_rdwr(mddev))
return -EROFS;
@ -7329,10 +7409,9 @@ static int update_size(struct mddev *mddev, sector_t num_sectors)
if (!rv) {
if (mddev_is_clustered(mddev))
md_cluster_ops->update_size(mddev, old_dev_sectors);
else if (mddev->queue) {
else if (!mddev_is_dm(mddev))
set_capacity_and_notify(mddev->gendisk,
mddev->array_sectors);
}
}
return rv;
}
@ -7349,8 +7428,7 @@ static int update_raid_disks(struct mddev *mddev, int raid_disks)
if (raid_disks <= 0 ||
(mddev->max_disks && raid_disks >= mddev->max_disks))
return -EINVAL;
if (mddev->sync_thread ||
test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) ||
mddev->reshape_position != MaxSector)
return -EBUSY;
@ -7546,16 +7624,17 @@ static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo)
return 0;
}
static inline bool md_ioctl_valid(unsigned int cmd)
static inline int md_ioctl_valid(unsigned int cmd)
{
switch (cmd) {
case ADD_NEW_DISK:
case GET_ARRAY_INFO:
case GET_BITMAP_FILE:
case GET_DISK_INFO:
case RAID_VERSION:
return 0;
case ADD_NEW_DISK:
case GET_BITMAP_FILE:
case HOT_ADD_DISK:
case HOT_REMOVE_DISK:
case RAID_VERSION:
case RESTART_ARRAY_RW:
case RUN_ARRAY:
case SET_ARRAY_INFO:
@ -7564,9 +7643,11 @@ static inline bool md_ioctl_valid(unsigned int cmd)
case STOP_ARRAY:
case STOP_ARRAY_RO:
case CLUSTERED_DISK_NACK:
return true;
if (!capable(CAP_SYS_ADMIN))
return -EACCES;
return 0;
default:
return false;
return -ENOTTY;
}
}
@ -7624,31 +7705,17 @@ static int md_ioctl(struct block_device *bdev, blk_mode_t mode,
int err = 0;
void __user *argp = (void __user *)arg;
struct mddev *mddev = NULL;
bool did_set_md_closing = false;
if (!md_ioctl_valid(cmd))
return -ENOTTY;
switch (cmd) {
case RAID_VERSION:
case GET_ARRAY_INFO:
case GET_DISK_INFO:
break;
default:
if (!capable(CAP_SYS_ADMIN))
return -EACCES;
}
err = md_ioctl_valid(cmd);
if (err)
return err;
/*
* Commands dealing with the RAID driver but not any
* particular array:
*/
switch (cmd) {
case RAID_VERSION:
err = get_version(argp);
goto out;
default:;
}
if (cmd == RAID_VERSION)
return get_version(argp);
/*
* Commands creating/starting a new array:
@ -7656,35 +7723,23 @@ static int md_ioctl(struct block_device *bdev, blk_mode_t mode,
mddev = bdev->bd_disk->private_data;
if (!mddev) {
BUG();
goto out;
}
/* Some actions do not require the mutex */
switch (cmd) {
case GET_ARRAY_INFO:
if (!mddev->raid_disks && !mddev->external)
err = -ENODEV;
else
err = get_array_info(mddev, argp);
goto out;
return -ENODEV;
return get_array_info(mddev, argp);
case GET_DISK_INFO:
if (!mddev->raid_disks && !mddev->external)
err = -ENODEV;
else
err = get_disk_info(mddev, argp);
goto out;
return -ENODEV;
return get_disk_info(mddev, argp);
case SET_DISK_FAULTY:
err = set_disk_faulty(mddev, new_decode_dev(arg));
goto out;
return set_disk_faulty(mddev, new_decode_dev(arg));
case GET_BITMAP_FILE:
err = get_bitmap_file(mddev, argp);
goto out;
return get_bitmap_file(mddev, argp);
}
if (cmd == HOT_REMOVE_DISK)
@ -7697,20 +7752,9 @@ static int md_ioctl(struct block_device *bdev, blk_mode_t mode,
/* Need to flush page cache, and ensure no-one else opens
* and writes
*/
mutex_lock(&mddev->open_mutex);
if (mddev->pers && atomic_read(&mddev->openers) > 1) {
mutex_unlock(&mddev->open_mutex);
err = -EBUSY;
goto out;
}
if (test_and_set_bit(MD_CLOSING, &mddev->flags)) {
mutex_unlock(&mddev->open_mutex);
err = -EBUSY;
goto out;
}
did_set_md_closing = true;
mutex_unlock(&mddev->open_mutex);
sync_blockdev(bdev);
err = mddev_set_closing_and_sync_blockdev(mddev, 1);
if (err)
return err;
}
if (!md_is_rdwr(mddev))
@ -7751,11 +7795,12 @@ static int md_ioctl(struct block_device *bdev, blk_mode_t mode,
goto unlock;
case STOP_ARRAY:
err = do_md_stop(mddev, 0, bdev);
err = do_md_stop(mddev, 0);
goto unlock;
case STOP_ARRAY_RO:
err = md_set_readonly(mddev, bdev);
if (mddev->pers)
err = md_set_readonly(mddev);
goto unlock;
case HOT_REMOVE_DISK:
@ -7850,7 +7895,7 @@ unlock:
mddev_unlock(mddev);
out:
if(did_set_md_closing)
if (cmd == STOP_ARRAY_RO || (err && cmd == STOP_ARRAY))
clear_bit(MD_CLOSING, &mddev->flags);
return err;
}
@ -8687,10 +8732,7 @@ void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev,
bio_chain(discard_bio, bio);
bio_clone_blkg_association(discard_bio, bio);
if (mddev->gendisk)
trace_block_bio_remap(discard_bio,
disk_devt(mddev->gendisk),
bio->bi_iter.bi_sector);
mddev_trace_remap(mddev, discard_bio, bio->bi_iter.bi_sector);
submit_bio_noacct(discard_bio);
}
EXPORT_SYMBOL_GPL(md_submit_discard_bio);
@ -8737,6 +8779,23 @@ void md_account_bio(struct mddev *mddev, struct bio **bio)
}
EXPORT_SYMBOL_GPL(md_account_bio);
/*
 * Tear down a cloned bio whose bi_private points at its md_io_clone.
 *
 * - If the clone carries an error and the original bio does not yet, the
 *   error is copied to the original.
 * - I/O accounting on the original is ended when a start time was recorded.
 * - The reference on the clone and one reference on mddev->active_io are
 *   dropped.
 */
void md_free_cloned_bio(struct bio *bio)
{
	struct md_io_clone *clone = bio->bi_private;
	/* Everything needed later is fetched before bio_put(). */
	struct mddev *mddev = clone->mddev;
	struct bio *parent = clone->orig_bio;

	if (!parent->bi_status && bio->bi_status)
		parent->bi_status = bio->bi_status;

	if (clone->start_time)
		bio_end_io_acct(parent, clone->start_time);

	bio_put(bio);
	percpu_ref_put(&mddev->active_io);
}
EXPORT_SYMBOL_GPL(md_free_cloned_bio);
/* md_allow_write(mddev)
* Calling this ensures that the array is marked 'active' so that writes
* may proceed without blocking. It is important to call this before
@ -9170,7 +9229,7 @@ void md_do_sync(struct md_thread *thread)
mddev->delta_disks > 0 &&
mddev->pers->finish_reshape &&
mddev->pers->size &&
mddev->queue) {
!mddev_is_dm(mddev)) {
mddev_lock_nointr(mddev);
md_set_array_sectors(mddev, mddev->pers->size(mddev, 0, 0));
mddev_unlock(mddev);
@ -9270,9 +9329,14 @@ static bool md_spares_need_change(struct mddev *mddev)
{
struct md_rdev *rdev;
rdev_for_each(rdev, mddev)
if (rdev_removeable(rdev) || rdev_addable(rdev))
rcu_read_lock();
rdev_for_each_rcu(rdev, mddev) {
if (rdev_removeable(rdev) || rdev_addable(rdev)) {
rcu_read_unlock();
return true;
}
}
rcu_read_unlock();
return false;
}

View File

@ -18,6 +18,7 @@
#include <linux/timer.h>
#include <linux/wait.h>
#include <linux/workqueue.h>
#include <trace/events/block.h>
#include "md-cluster.h"
#define MaxSector (~(sector_t)0)
@ -207,6 +208,7 @@ enum flag_bits {
* check if there is collision between raid1
* serial bios.
*/
Nonrot, /* non-rotational device (SSD) */
};
static inline int is_badblock(struct md_rdev *rdev, sector_t s, int sectors,
@ -222,6 +224,16 @@ static inline int is_badblock(struct md_rdev *rdev, sector_t s, int sectors,
}
return 0;
}
/*
 * Convenience wrapper around is_badblock() for callers that only want its
 * return value for the range of @sectors sectors starting at @s, and have
 * no use for where the bad range starts or how long it is.
 */
static inline int rdev_has_badblock(struct md_rdev *rdev, sector_t s,
				    int sectors)
{
	/* is_badblock() needs somewhere to store the details; they are discarded. */
	sector_t unused_first;
	int unused_count;

	return is_badblock(rdev, s, sectors, &unused_first, &unused_count);
}
extern int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
int is_new);
extern int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
@ -468,7 +480,6 @@ struct mddev {
struct timer_list safemode_timer;
struct percpu_ref writes_pending;
int sync_checkers; /* # of threads checking writes_pending */
struct request_queue *queue; /* for plugging ... */
struct bitmap *bitmap; /* the bitmap for the device */
struct {
@ -558,6 +569,37 @@ enum recovery_flags {
MD_RESYNCING_REMOTE, /* remote node is running resync thread */
};
/* Values held in mddev->ro. */
enum md_ro_state {
MD_RDWR, /* the state md_is_rdwr() tests for */
MD_RDONLY,
MD_AUTO_READ, /* NOTE(review): presumably read-only until the first write arrives - confirm */
MD_MAX_STATE /* NOTE(review): looks like an end marker rather than a real state - confirm */
};
/* True when mddev->ro is MD_RDWR, false for every other md_ro_state. */
static inline bool md_is_rdwr(struct mddev *mddev)
{
	return mddev->ro == MD_RDWR;
}
/*
 * Report whether a reshape that has been started is interrupted, or is
 * about to be.
 *
 * Returns false when reshape_position is MaxSector (no reshape was ever
 * started) and when a reshape is running with none of the WAIT, INTR or
 * FROZEN recovery flags set.
 */
static inline bool reshape_interrupted(struct mddev *mddev)
{
	/* A reshape was never started. */
	if (mddev->reshape_position == MaxSector)
		return false;

	/*
	 * Either nothing is running (already interrupted), or the running
	 * reshape is going to be interrupted soon because WAIT, INTR or
	 * FROZEN is set.
	 */
	return !test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
	       test_bit(MD_RECOVERY_WAIT, &mddev->recovery) ||
	       test_bit(MD_RECOVERY_INTR, &mddev->recovery) ||
	       test_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
}
static inline int __must_check mddev_lock(struct mddev *mddev)
{
return mutex_lock_interruptible(&mddev->reconfig_mutex);
@ -617,6 +659,7 @@ struct md_personality
int (*start_reshape) (struct mddev *mddev);
void (*finish_reshape) (struct mddev *mddev);
void (*update_reshape_pos) (struct mddev *mddev);
void (*prepare_suspend) (struct mddev *mddev);
/* quiesce suspends or resumes internal processing.
* 1 - stop new actions and wait for action io to complete
* 0 - return to normal behaviour
@ -750,6 +793,7 @@ extern void md_finish_reshape(struct mddev *mddev);
void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev,
struct bio *bio, sector_t start, sector_t size);
void md_account_bio(struct mddev *mddev, struct bio **bio);
void md_free_cloned_bio(struct bio *bio);
extern bool __must_check md_flush_request(struct mddev *mddev, struct bio *bio);
extern void md_super_write(struct mddev *mddev, struct md_rdev *rdev,
@ -778,9 +822,12 @@ extern void md_stop_writes(struct mddev *mddev);
extern int md_rdev_init(struct md_rdev *rdev);
extern void md_rdev_clear(struct md_rdev *rdev);
extern void md_handle_request(struct mddev *mddev, struct bio *bio);
extern bool md_handle_request(struct mddev *mddev, struct bio *bio);
extern int mddev_suspend(struct mddev *mddev, bool interruptible);
extern void mddev_resume(struct mddev *mddev);
extern void md_idle_sync_thread(struct mddev *mddev);
extern void md_frozen_sync_thread(struct mddev *mddev);
extern void md_unfrozen_sync_thread(struct mddev *mddev);
extern void md_reload_sb(struct mddev *mddev, int raid_disk);
extern void md_update_sb(struct mddev *mddev, int force);
@ -821,7 +868,7 @@ static inline void mddev_check_write_zeroes(struct mddev *mddev, struct bio *bio
{
if (bio_op(bio) == REQ_OP_WRITE_ZEROES &&
!bio->bi_bdev->bd_disk->queue->limits.max_write_zeroes_sectors)
mddev->queue->limits.max_write_zeroes_sectors = 0;
mddev->gendisk->queue->limits.max_write_zeroes_sectors = 0;
}
static inline int mddev_suspend_and_lock(struct mddev *mddev)
@ -860,7 +907,31 @@ void md_autostart_arrays(int part);
int md_set_array_info(struct mddev *mddev, struct mdu_array_info_s *info);
int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info);
int do_md_run(struct mddev *mddev);
void mddev_stack_rdev_limits(struct mddev *mddev, struct queue_limits *lim);
int mddev_stack_new_rdev(struct mddev *mddev, struct md_rdev *rdev);
void mddev_update_io_opt(struct mddev *mddev, unsigned int nr_stripes);
extern const struct block_device_operations md_fops;
/*
 * MD devices can be used underneath DM, in which case ->gendisk is NULL.
 * That NULL pointer is what identifies a DM-owned array.
 */
static inline bool mddev_is_dm(struct mddev *mddev)
{
	return mddev->gendisk == NULL;
}
/*
 * Emit a block_bio_remap trace event for @bio, using the array's own disk
 * as the device and @sector as the sector argument.  Skipped when
 * mddev_is_dm() is true, since there is then no gendisk to take a dev_t
 * from.
 */
static inline void mddev_trace_remap(struct mddev *mddev, struct bio *bio,
				     sector_t sector)
{
	if (mddev_is_dm(mddev))
		return;

	trace_block_bio_remap(bio, disk_devt(mddev->gendisk), sector);
}
/*
 * Add a blktrace message on the array's own request queue.  Nothing is
 * emitted when mddev_is_dm() is true, because there is then no gendisk and
 * hence no queue to attach the message to.
 *
 * @mddev is captured in a local first so that the argument expression is
 * evaluated exactly once, however often the expansion below refers to it.
 */
#define mddev_add_trace_msg(mddev, fmt, args...)			\
do {									\
	struct mddev *__mddev = (mddev);				\
									\
	if (!mddev_is_dm(__mddev))					\
		blk_add_trace_msg(__mddev->gendisk->queue, fmt, ##args); \
} while (0)
#endif /* _MD_MD_H */

View File

@ -379,6 +379,19 @@ static void raid0_free(struct mddev *mddev, void *priv)
free_conf(mddev, conf);
}
/*
 * Build the queue limits for a raid0 array and install them on the array's
 * request queue.
 *
 * Starts from the stacking defaults, fills in the chunk-derived values,
 * stacks in every rdev and hands the result to queue_limits_set().
 * Dereferences mddev->gendisk, so it must not be called when
 * mddev_is_dm() is true.
 *
 * Returns the result of queue_limits_set().
 */
static int raid0_set_limits(struct mddev *mddev)
{
	struct queue_limits lim;
	int err;

	blk_set_stacking_limits(&lim);

	/* I/O hints: io_min is chunk_sectors << 9, io_opt is one io_min per raid disk. */
	lim.io_min = mddev->chunk_sectors << 9;
	lim.io_opt = lim.io_min * mddev->raid_disks;

	/* Both sector caps are one chunk. */
	lim.max_write_zeroes_sectors = mddev->chunk_sectors;
	lim.max_hw_sectors = mddev->chunk_sectors;

	mddev_stack_rdev_limits(mddev, &lim);

	err = queue_limits_set(mddev->gendisk->queue, &lim);
	return err;
}
static int raid0_run(struct mddev *mddev)
{
struct r0conf *conf;
@ -399,20 +412,10 @@ static int raid0_run(struct mddev *mddev)
mddev->private = conf;
}
conf = mddev->private;
if (mddev->queue) {
struct md_rdev *rdev;
blk_queue_max_hw_sectors(mddev->queue, mddev->chunk_sectors);
blk_queue_max_write_zeroes_sectors(mddev->queue, mddev->chunk_sectors);
blk_queue_io_min(mddev->queue, mddev->chunk_sectors << 9);
blk_queue_io_opt(mddev->queue,
(mddev->chunk_sectors << 9) * mddev->raid_disks);
rdev_for_each(rdev, mddev) {
disk_stack_limits(mddev->gendisk, rdev->bdev,
rdev->data_offset << 9);
}
if (!mddev_is_dm(mddev)) {
ret = raid0_set_limits(mddev);
if (ret)
goto out_free_conf;
}
/* calculate array device size */
@ -426,8 +429,10 @@ static int raid0_run(struct mddev *mddev)
ret = md_integrity_register(mddev);
if (ret)
free_conf(mddev, conf);
goto out_free_conf;
return 0;
out_free_conf:
free_conf(mddev, conf);
return ret;
}
@ -578,10 +583,7 @@ static void raid0_map_submit_bio(struct mddev *mddev, struct bio *bio)
bio_set_dev(bio, tmp_dev->bdev);
bio->bi_iter.bi_sector = sector + zone->dev_start +
tmp_dev->data_offset;
if (mddev->gendisk)
trace_block_bio_remap(bio, disk_devt(mddev->gendisk),
bio_sector);
mddev_trace_remap(mddev, bio, bio_sector);
mddev_check_write_zeroes(mddev, bio);
submit_bio_noacct(bio);
}

View File

@ -227,3 +227,72 @@ static inline bool exceed_read_errors(struct mddev *mddev, struct md_rdev *rdev)
return false;
}
/**
 * raid1_check_read_range() - check a read range against the bad block list
 * @rdev: the rdev to read from;
 * @this_sector: first sector of the read;
 * @len: requested length in sectors; may be rewritten, see below;
 *
 * Helper function for read_balance().
 *
 * Return: the number of sectors readable from @rdev starting at @this_sector:
 * - no bad block overlaps the range: @len is returned;
 * - the bad blocks start after @this_sector: the length of the leading good
 *   region is returned;
 * - the bad blocks start at or before @this_sector and reach the end of the
 *   range: 0 is returned;
 * - the bad blocks start at or before @this_sector but end inside the range:
 *   0 is returned and @len is rewritten to the distance from @this_sector to
 *   the first sector past the bad blocks.
 */
static inline int raid1_check_read_range(struct md_rdev *rdev,
					 sector_t this_sector, int *len)
{
	sector_t bad_start;
	sector_t bad_end;
	int bad_len;

	/* nothing bad in the way, the whole range can be read */
	if (!is_badblock(rdev, this_sector, *len, &bad_start, &bad_len))
		return *len;

	/* good sectors first: report how many there are before the bad ones */
	if (bad_start > this_sector)
		return bad_start - this_sector;

	/*
	 * The read starts inside the bad blocks, so nothing is readable here.
	 * If the bad blocks stop short of the end of the range, tell the
	 * caller how far it is to the first good sector.
	 */
	bad_end = bad_start + bad_len;
	if (this_sector + *len > bad_end)
		*len = bad_end - this_sector;

	return 0;
}
/*
 * Decide whether a read must simply take the first usable rdev instead of
 * being balanced.
 *
 * That is the case when the read range [this_sector, this_sector + len)
 * extends beyond mddev->recovery_cp, or when the array is clustered and the
 * cluster layer reports that the area is being resynced.
 */
static inline bool raid1_should_read_first(struct mddev *mddev,
					   sector_t this_sector, int len)
{
	sector_t read_end = this_sector + len;

	return mddev->recovery_cp < read_end ||
	       (mddev_is_clustered(mddev) &&
		md_cluster_ops->area_resyncing(mddev, READ, this_sector,
					       read_end));
}

View File

@ -46,9 +46,6 @@
static void allow_barrier(struct r1conf *conf, sector_t sector_nr);
static void lower_barrier(struct r1conf *conf, sector_t sector_nr);
#define raid1_log(md, fmt, args...) \
do { if ((md)->queue) blk_add_trace_msg((md)->queue, "raid1 " fmt, ##args); } while (0)
#define RAID_1_10_NAME "raid1"
#include "raid1-10.c"
@ -498,9 +495,6 @@ static void raid1_end_write_request(struct bio *bio)
* to user-side. So if something waits for IO, then it
* will wait for the 'master' bio.
*/
sector_t first_bad;
int bad_sectors;
r1_bio->bios[mirror] = NULL;
to_put = bio;
/*
@ -516,8 +510,8 @@ static void raid1_end_write_request(struct bio *bio)
set_bit(R1BIO_Uptodate, &r1_bio->state);
/* Maybe we can clear some bad blocks. */
if (is_badblock(rdev, r1_bio->sector, r1_bio->sectors,
&first_bad, &bad_sectors) && !discard_error) {
if (rdev_has_badblock(rdev, r1_bio->sector, r1_bio->sectors) &&
!discard_error) {
r1_bio->bios[mirror] = IO_MADE_GOOD;
set_bit(R1BIO_MadeGood, &r1_bio->state);
}
@ -582,211 +576,312 @@ static sector_t align_to_barrier_unit_end(sector_t start_sector,
return len;
}
/*
* This routine returns the disk from which the requested read should
* be done. There is a per-array 'next expected sequential IO' sector
* number - if this matches on the next IO then we use the last disk.
* There is also a per-disk 'last know head position' sector that is
* maintained from IRQ contexts, both the normal and the resync IO
* completion handlers update this position correctly. If there is no
* perfect sequential match then we pick the disk whose head is closest.
*
* If there are 2 mirrors in the same 2 devices, performance degrades
* because position is mirror, not device based.
*
* The rdev for the device selected will have nr_pending incremented.
*/
static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sectors)
static void update_read_sectors(struct r1conf *conf, int disk,
sector_t this_sector, int len)
{
const sector_t this_sector = r1_bio->sector;
int sectors;
int best_good_sectors;
int best_disk, best_dist_disk, best_pending_disk;
int has_nonrot_disk;
struct raid1_info *info = &conf->mirrors[disk];
atomic_inc(&info->rdev->nr_pending);
if (info->next_seq_sect != this_sector)
info->seq_start = this_sector;
info->next_seq_sect = this_sector + len;
}
static int choose_first_rdev(struct r1conf *conf, struct r1bio *r1_bio,
int *max_sectors)
{
sector_t this_sector = r1_bio->sector;
int len = r1_bio->sectors;
int disk;
sector_t best_dist;
unsigned int min_pending;
struct md_rdev *rdev;
int choose_first;
int choose_next_idle;
/*
* Check if we can balance. We can balance on the whole
* device if no resync is going on, or below the resync window.
* We take the first readable disk when above the resync window.
*/
retry:
sectors = r1_bio->sectors;
best_disk = -1;
best_dist_disk = -1;
best_dist = MaxSector;
best_pending_disk = -1;
min_pending = UINT_MAX;
best_good_sectors = 0;
has_nonrot_disk = 0;
choose_next_idle = 0;
clear_bit(R1BIO_FailFast, &r1_bio->state);
if ((conf->mddev->recovery_cp < this_sector + sectors) ||
(mddev_is_clustered(conf->mddev) &&
md_cluster_ops->area_resyncing(conf->mddev, READ, this_sector,
this_sector + sectors)))
choose_first = 1;
else
choose_first = 0;
for (disk = 0 ; disk < conf->raid_disks * 2 ; disk++) {
sector_t dist;
sector_t first_bad;
int bad_sectors;
unsigned int pending;
bool nonrot;
struct md_rdev *rdev;
int read_len;
if (r1_bio->bios[disk] == IO_BLOCKED)
continue;
rdev = conf->mirrors[disk].rdev;
if (r1_bio->bios[disk] == IO_BLOCKED
|| rdev == NULL
|| test_bit(Faulty, &rdev->flags))
continue;
if (!test_bit(In_sync, &rdev->flags) &&
rdev->recovery_offset < this_sector + sectors)
continue;
if (test_bit(WriteMostly, &rdev->flags)) {
/* Don't balance among write-mostly, just
* use the first as a last resort */
if (best_dist_disk < 0) {
if (is_badblock(rdev, this_sector, sectors,
&first_bad, &bad_sectors)) {
if (first_bad <= this_sector)
/* Cannot use this */
continue;
best_good_sectors = first_bad - this_sector;
} else
best_good_sectors = sectors;
best_dist_disk = disk;
best_pending_disk = disk;
}
continue;
}
/* This is a reasonable device to use. It might
* even be best.
*/
if (is_badblock(rdev, this_sector, sectors,
&first_bad, &bad_sectors)) {
if (best_dist < MaxSector)
/* already have a better device */
continue;
if (first_bad <= this_sector) {
/* cannot read here. If this is the 'primary'
* device, then we must not read beyond
* bad_sectors from another device..
*/
bad_sectors -= (this_sector - first_bad);
if (choose_first && sectors > bad_sectors)
sectors = bad_sectors;
if (best_good_sectors > sectors)
best_good_sectors = sectors;
} else {
sector_t good_sectors = first_bad - this_sector;
if (good_sectors > best_good_sectors) {
best_good_sectors = good_sectors;
best_disk = disk;
}
if (choose_first)
break;
}
continue;
} else {
if ((sectors > best_good_sectors) && (best_disk >= 0))
best_disk = -1;
best_good_sectors = sectors;
}
if (best_disk >= 0)
/* At least two disks to choose from so failfast is OK */
set_bit(R1BIO_FailFast, &r1_bio->state);
nonrot = bdev_nonrot(rdev->bdev);
has_nonrot_disk |= nonrot;
pending = atomic_read(&rdev->nr_pending);
dist = abs(this_sector - conf->mirrors[disk].head_position);
if (choose_first) {
best_disk = disk;
break;
}
/* Don't change to another disk for sequential reads */
if (conf->mirrors[disk].next_seq_sect == this_sector
|| dist == 0) {
int opt_iosize = bdev_io_opt(rdev->bdev) >> 9;
struct raid1_info *mirror = &conf->mirrors[disk];
best_disk = disk;
/*
* If buffered sequential IO size exceeds optimal
* iosize, check if there is idle disk. If yes, choose
* the idle disk. read_balance could already choose an
* idle disk before noticing it's a sequential IO in
* this disk. This doesn't matter because this disk
* will idle, next time it will be utilized after the
* first disk has IO size exceeds optimal iosize. In
* this way, iosize of the first disk will be optimal
* iosize at least. iosize of the second disk might be
* small, but not a big deal since when the second disk
* starts IO, the first disk is likely still busy.
*/
if (nonrot && opt_iosize > 0 &&
mirror->seq_start != MaxSector &&
mirror->next_seq_sect > opt_iosize &&
mirror->next_seq_sect - opt_iosize >=
mirror->seq_start) {
choose_next_idle = 1;
continue;
}
break;
}
if (choose_next_idle)
if (!rdev || test_bit(Faulty, &rdev->flags))
continue;
if (min_pending > pending) {
min_pending = pending;
best_pending_disk = disk;
}
if (dist < best_dist) {
best_dist = dist;
best_dist_disk = disk;
/* choose the first disk even if it has some bad blocks. */
read_len = raid1_check_read_range(rdev, this_sector, &len);
if (read_len > 0) {
update_read_sectors(conf, disk, this_sector, read_len);
*max_sectors = read_len;
return disk;
}
}
return -1;
}
static int choose_bb_rdev(struct r1conf *conf, struct r1bio *r1_bio,
int *max_sectors)
{
sector_t this_sector = r1_bio->sector;
int best_disk = -1;
int best_len = 0;
int disk;
for (disk = 0 ; disk < conf->raid_disks * 2 ; disk++) {
struct md_rdev *rdev;
int len;
int read_len;
if (r1_bio->bios[disk] == IO_BLOCKED)
continue;
rdev = conf->mirrors[disk].rdev;
if (!rdev || test_bit(Faulty, &rdev->flags) ||
test_bit(WriteMostly, &rdev->flags))
continue;
/* keep track of the disk with the most readable sectors. */
len = r1_bio->sectors;
read_len = raid1_check_read_range(rdev, this_sector, &len);
if (read_len > best_len) {
best_disk = disk;
best_len = read_len;
}
}
if (best_disk != -1) {
*max_sectors = best_len;
update_read_sectors(conf, best_disk, this_sector, best_len);
}
return best_disk;
}
static int choose_slow_rdev(struct r1conf *conf, struct r1bio *r1_bio,
int *max_sectors)
{
sector_t this_sector = r1_bio->sector;
int bb_disk = -1;
int bb_read_len = 0;
int disk;
for (disk = 0 ; disk < conf->raid_disks * 2 ; disk++) {
struct md_rdev *rdev;
int len;
int read_len;
if (r1_bio->bios[disk] == IO_BLOCKED)
continue;
rdev = conf->mirrors[disk].rdev;
if (!rdev || test_bit(Faulty, &rdev->flags) ||
!test_bit(WriteMostly, &rdev->flags))
continue;
/* there are no bad blocks, we can use this disk */
len = r1_bio->sectors;
read_len = raid1_check_read_range(rdev, this_sector, &len);
if (read_len == r1_bio->sectors) {
update_read_sectors(conf, disk, this_sector, read_len);
return disk;
}
/*
* there are partial bad blocks, choose the rdev with largest
* read length.
*/
if (read_len > bb_read_len) {
bb_disk = disk;
bb_read_len = read_len;
}
}
if (bb_disk != -1) {
*max_sectors = bb_read_len;
update_read_sectors(conf, bb_disk, this_sector, bb_read_len);
}
return bb_disk;
}
static bool is_sequential(struct r1conf *conf, int disk, struct r1bio *r1_bio)
{
/* TODO: address issues with this check and concurrency. */
return conf->mirrors[disk].next_seq_sect == r1_bio->sector ||
conf->mirrors[disk].head_position == r1_bio->sector;
}
/*
 * Tell whether a sequential read stream on @disk has grown past the device's
 * optimal I/O size, in which case the caller should look for an idle disk.
 *
 * Only disks flagged Nonrot are considered. The stream length is measured
 * from the mirror's seq_start to its next_seq_sect and compared with the
 * optimal I/O size converted to sectors.
 */
static bool should_choose_next(struct r1conf *conf, int disk)
{
	struct raid1_info *mirror = &conf->mirrors[disk];
	int opt_iosize;

	if (!test_bit(Nonrot, &mirror->rdev->flags))
		return false;

	opt_iosize = bdev_io_opt(mirror->rdev->bdev) >> 9;

	/* no usable optimal size reported */
	if (opt_iosize <= 0)
		return false;

	/* no sequential stream recorded on this mirror yet */
	if (mirror->seq_start == MaxSector)
		return false;

	/* guard the subtraction below against wrapping */
	if (mirror->next_seq_sect <= opt_iosize)
		return false;

	return mirror->next_seq_sect - opt_iosize >= mirror->seq_start;
}
static bool rdev_readable(struct md_rdev *rdev, struct r1bio *r1_bio)
{
if (!rdev || test_bit(Faulty, &rdev->flags))
return false;
/* still in recovery */
if (!test_bit(In_sync, &rdev->flags) &&
rdev->recovery_offset < r1_bio->sector + r1_bio->sectors)
return false;
/* don't read from slow disk unless have to */
if (test_bit(WriteMostly, &rdev->flags))
return false;
/* don't split IO for bad blocks unless have to */
if (rdev_has_badblock(rdev, r1_bio->sector, r1_bio->sectors))
return false;
return true;
}
/*
 * Running state kept by choose_best_rdev() while it scans the mirrors.
 *
 * min_pending is unsigned because it starts out as UINT_MAX and is only ever
 * compared with, or assigned from, an unsigned pending-request count.
 */
struct read_balance_ctl {
	/* smallest distance between the read and a mirror's head position */
	sector_t closest_dist;
	/* disk that distance belongs to, -1 if none yet */
	int closest_dist_disk;
	/* smallest pending count seen so far */
	unsigned int min_pending;
	/* disk that pending count belongs to, -1 if none yet */
	int min_pending_disk;
	/* disk the read is sequential on, -1 if none */
	int sequential_disk;
	/* number of readable disks seen so far */
	int readable_disks;
};
static int choose_best_rdev(struct r1conf *conf, struct r1bio *r1_bio)
{
int disk;
struct read_balance_ctl ctl = {
.closest_dist_disk = -1,
.closest_dist = MaxSector,
.min_pending_disk = -1,
.min_pending = UINT_MAX,
.sequential_disk = -1,
};
for (disk = 0 ; disk < conf->raid_disks * 2 ; disk++) {
struct md_rdev *rdev;
sector_t dist;
unsigned int pending;
if (r1_bio->bios[disk] == IO_BLOCKED)
continue;
rdev = conf->mirrors[disk].rdev;
if (!rdev_readable(rdev, r1_bio))
continue;
/* At least two disks to choose from so failfast is OK */
if (ctl.readable_disks++ == 1)
set_bit(R1BIO_FailFast, &r1_bio->state);
pending = atomic_read(&rdev->nr_pending);
dist = abs(r1_bio->sector - conf->mirrors[disk].head_position);
/* Don't change to another disk for sequential reads */
if (is_sequential(conf, disk, r1_bio)) {
if (!should_choose_next(conf, disk))
return disk;
/*
* Add 'pending' to avoid choosing this disk if
* there is other idle disk.
*/
pending++;
/*
* If there is no other idle disk, this disk
* will be chosen.
*/
ctl.sequential_disk = disk;
}
if (ctl.min_pending > pending) {
ctl.min_pending = pending;
ctl.min_pending_disk = disk;
}
if (ctl.closest_dist > dist) {
ctl.closest_dist = dist;
ctl.closest_dist_disk = disk;
}
}
/*
* sequential IO size exceeds optimal iosize, however, there is no other
* idle disk, so choose the sequential disk.
*/
if (ctl.sequential_disk != -1 && ctl.min_pending != 0)
return ctl.sequential_disk;
/*
* If all disks are rotational, choose the closest disk. If any disk is
* non-rotational, choose the disk with less pending request even the
* disk is rotational, which might/might not be optimal for raids with
* mixed rotational/non-rotational disks depending on workload.
*/
if (best_disk == -1) {
if (has_nonrot_disk || min_pending == 0)
best_disk = best_pending_disk;
else
best_disk = best_dist_disk;
if (ctl.min_pending_disk != -1 &&
(READ_ONCE(conf->nonrot_disks) || ctl.min_pending == 0))
return ctl.min_pending_disk;
else
return ctl.closest_dist_disk;
}
/*
* This routine returns the disk from which the requested read should be done.
*
* 1) If resync is in progress, find the first usable disk and use it even if it
* has some bad blocks.
*
* 2) Now that there is no resync, loop through all disks and skipping slow
* disks and disks with bad blocks for now. Only pay attention to key disk
* choice.
*
* 3) If we've made it this far, now look for disks with bad blocks and choose
* the one with most number of sectors.
*
* 4) If we are all the way at the end, we have no choice but to use a disk even
* if it is write mostly.
*
* The rdev for the device selected will have nr_pending incremented.
*/
static int read_balance(struct r1conf *conf, struct r1bio *r1_bio,
int *max_sectors)
{
int disk;
clear_bit(R1BIO_FailFast, &r1_bio->state);
if (raid1_should_read_first(conf->mddev, r1_bio->sector,
r1_bio->sectors))
return choose_first_rdev(conf, r1_bio, max_sectors);
disk = choose_best_rdev(conf, r1_bio);
if (disk >= 0) {
*max_sectors = r1_bio->sectors;
update_read_sectors(conf, disk, r1_bio->sector,
r1_bio->sectors);
return disk;
}
if (best_disk >= 0) {
rdev = conf->mirrors[best_disk].rdev;
if (!rdev)
goto retry;
atomic_inc(&rdev->nr_pending);
sectors = best_good_sectors;
/*
* If we are here it means we didn't find a perfectly good disk so
* now spend a bit more time trying to find one with the most good
* sectors.
*/
disk = choose_bb_rdev(conf, r1_bio, max_sectors);
if (disk >= 0)
return disk;
if (conf->mirrors[best_disk].next_seq_sect != this_sector)
conf->mirrors[best_disk].seq_start = this_sector;
conf->mirrors[best_disk].next_seq_sect = this_sector + sectors;
}
*max_sectors = sectors;
return best_disk;
return choose_slow_rdev(conf, r1_bio, max_sectors);
}
static void wake_up_barrier(struct r1conf *conf)
@ -1098,7 +1193,7 @@ static void freeze_array(struct r1conf *conf, int extra)
*/
spin_lock_irq(&conf->resync_lock);
conf->array_frozen = 1;
raid1_log(conf->mddev, "wait freeze");
mddev_add_trace_msg(conf->mddev, "raid1 wait freeze");
wait_event_lock_irq_cmd(
conf->wait_barrier,
get_unqueued_pending(conf) == extra,
@ -1287,7 +1382,7 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio,
* Reading from a write-mostly device must take care not to
* over-take any writes that are 'behind'
*/
raid1_log(mddev, "wait behind writes");
mddev_add_trace_msg(mddev, "raid1 wait behind writes");
wait_event(bitmap->behind_wait,
atomic_read(&bitmap->behind_writes) == 0);
}
@ -1320,11 +1415,7 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio,
test_bit(R1BIO_FailFast, &r1_bio->state))
read_bio->bi_opf |= MD_FAILFAST;
read_bio->bi_private = r1_bio;
if (mddev->gendisk)
trace_block_bio_remap(read_bio, disk_devt(mddev->gendisk),
r1_bio->sector);
mddev_trace_remap(mddev, read_bio, r1_bio->sector);
submit_bio_noacct(read_bio);
}
@ -1474,7 +1565,8 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
bio_wouldblock_error(bio);
return;
}
raid1_log(mddev, "wait rdev %d blocked", blocked_rdev->raid_disk);
mddev_add_trace_msg(mddev, "raid1 wait rdev %d blocked",
blocked_rdev->raid_disk);
md_wait_for_blocked_rdev(blocked_rdev, mddev);
wait_barrier(conf, bio->bi_iter.bi_sector, false);
goto retry_write;
@ -1557,10 +1649,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
mbio->bi_private = r1_bio;
atomic_inc(&r1_bio->remaining);
if (mddev->gendisk)
trace_block_bio_remap(mbio, disk_devt(mddev->gendisk),
r1_bio->sector);
mddev_trace_remap(mddev, mbio, r1_bio->sector);
/* flush_pending_writes() needs access to the rdev so...*/
mbio->bi_bdev = (void *)rdev;
if (!raid1_add_bio_to_plug(mddev, mbio, raid1_unplug, disks)) {
@ -1760,6 +1849,52 @@ static int raid1_spare_active(struct mddev *mddev)
return count;
}
/*
 * Install @rdev into the mirror table of @conf.
 *
 * @disk selects the mirror; when @replacement is true the matching slot in
 * the second half of the table (offset by conf->raid_disks) is used instead.
 * A non-rotational device gets the Nonrot flag and is counted in
 * conf->nonrot_disks. rdev->raid_disk is always set to @disk itself.
 *
 * Returns false, changing nothing, if the slot is already occupied;
 * true otherwise.
 */
static bool raid1_add_conf(struct r1conf *conf, struct md_rdev *rdev, int disk,
			   bool replacement)
{
	int slot = replacement ? disk + conf->raid_disks : disk;
	struct raid1_info *mirror = &conf->mirrors[slot];

	if (mirror->rdev)
		return false;

	if (bdev_nonrot(rdev->bdev)) {
		set_bit(Nonrot, &rdev->flags);
		WRITE_ONCE(conf->nonrot_disks, conf->nonrot_disks + 1);
	}

	rdev->raid_disk = disk;
	mirror->head_position = 0;
	mirror->seq_start = MaxSector;
	/* publish the rdev last, once the slot is fully initialised */
	WRITE_ONCE(mirror->rdev, rdev);

	return true;
}
static bool raid1_remove_conf(struct r1conf *conf, int disk)
{
struct raid1_info *info = conf->mirrors + disk;
struct md_rdev *rdev = info->rdev;
if (!rdev || test_bit(In_sync, &rdev->flags) ||
atomic_read(&rdev->nr_pending))
return false;
/* Only remove non-faulty devices if recovery is not possible. */
if (!test_bit(Faulty, &rdev->flags) &&
rdev->mddev->recovery_disabled != conf->recovery_disabled &&
rdev->mddev->degraded < conf->raid_disks)
return false;
if (test_and_clear_bit(Nonrot, &rdev->flags))
WRITE_ONCE(conf->nonrot_disks, conf->nonrot_disks - 1);
WRITE_ONCE(info->rdev, NULL);
return true;
}
static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
{
struct r1conf *conf = mddev->private;
@ -1791,19 +1926,16 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
for (mirror = first; mirror <= last; mirror++) {
p = conf->mirrors + mirror;
if (!p->rdev) {
if (mddev->gendisk)
disk_stack_limits(mddev->gendisk, rdev->bdev,
rdev->data_offset << 9);
err = mddev_stack_new_rdev(mddev, rdev);
if (err)
return err;
p->head_position = 0;
rdev->raid_disk = mirror;
err = 0;
raid1_add_conf(conf, rdev, mirror, false);
/* As all devices are equivalent, we don't need a full recovery
* if this was recently any drive of the array
*/
if (rdev->saved_raid_disk < 0)
conf->fullsync = 1;
WRITE_ONCE(p->rdev, rdev);
break;
}
if (test_bit(WantReplacement, &p->rdev->flags) &&
@ -1813,13 +1945,11 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
if (err && repl_slot >= 0) {
/* Add this device as a replacement */
p = conf->mirrors + repl_slot;
clear_bit(In_sync, &rdev->flags);
set_bit(Replacement, &rdev->flags);
rdev->raid_disk = repl_slot;
raid1_add_conf(conf, rdev, repl_slot, true);
err = 0;
conf->fullsync = 1;
WRITE_ONCE(p[conf->raid_disks].rdev, rdev);
}
print_conf(conf);
@ -1836,27 +1966,20 @@ static int raid1_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
if (unlikely(number >= conf->raid_disks))
goto abort;
if (rdev != p->rdev)
p = conf->mirrors + conf->raid_disks + number;
if (rdev != p->rdev) {
number += conf->raid_disks;
p = conf->mirrors + number;
}
print_conf(conf);
if (rdev == p->rdev) {
if (test_bit(In_sync, &rdev->flags) ||
atomic_read(&rdev->nr_pending)) {
if (!raid1_remove_conf(conf, number)) {
err = -EBUSY;
goto abort;
}
/* Only remove non-faulty devices if recovery
* is not possible.
*/
if (!test_bit(Faulty, &rdev->flags) &&
mddev->recovery_disabled != conf->recovery_disabled &&
mddev->degraded < conf->raid_disks) {
err = -EBUSY;
goto abort;
}
WRITE_ONCE(p->rdev, NULL);
if (conf->mirrors[conf->raid_disks + number].rdev) {
if (number < conf->raid_disks &&
conf->mirrors[conf->raid_disks + number].rdev) {
/* We just removed a device that is being replaced.
* Move down the replacement. We drain all IO before
* doing this to avoid confusion.
@ -1944,8 +2067,6 @@ static void end_sync_write(struct bio *bio)
struct r1bio *r1_bio = get_resync_r1bio(bio);
struct mddev *mddev = r1_bio->mddev;
struct r1conf *conf = mddev->private;
sector_t first_bad;
int bad_sectors;
struct md_rdev *rdev = conf->mirrors[find_bio_disk(r1_bio, bio)].rdev;
if (!uptodate) {
@ -1955,14 +2076,11 @@ static void end_sync_write(struct bio *bio)
set_bit(MD_RECOVERY_NEEDED, &
mddev->recovery);
set_bit(R1BIO_WriteError, &r1_bio->state);
} else if (is_badblock(rdev, r1_bio->sector, r1_bio->sectors,
&first_bad, &bad_sectors) &&
!is_badblock(conf->mirrors[r1_bio->read_disk].rdev,
r1_bio->sector,
r1_bio->sectors,
&first_bad, &bad_sectors)
)
} else if (rdev_has_badblock(rdev, r1_bio->sector, r1_bio->sectors) &&
!rdev_has_badblock(conf->mirrors[r1_bio->read_disk].rdev,
r1_bio->sector, r1_bio->sectors)) {
set_bit(R1BIO_MadeGood, &r1_bio->state);
}
put_sync_write_buf(r1_bio, uptodate);
}
@ -2279,16 +2397,12 @@ static void fix_read_error(struct r1conf *conf, struct r1bio *r1_bio)
s = PAGE_SIZE >> 9;
do {
sector_t first_bad;
int bad_sectors;
rdev = conf->mirrors[d].rdev;
if (rdev &&
(test_bit(In_sync, &rdev->flags) ||
(!test_bit(Faulty, &rdev->flags) &&
rdev->recovery_offset >= sect + s)) &&
is_badblock(rdev, sect, s,
&first_bad, &bad_sectors) == 0) {
rdev_has_badblock(rdev, sect, s) == 0) {
atomic_inc(&rdev->nr_pending);
if (sync_page_io(rdev, sect, s<<9,
conf->tmppage, REQ_OP_READ, false))
@ -3006,23 +3120,17 @@ static struct r1conf *setup_conf(struct mddev *mddev)
err = -EINVAL;
spin_lock_init(&conf->device_lock);
conf->raid_disks = mddev->raid_disks;
rdev_for_each(rdev, mddev) {
int disk_idx = rdev->raid_disk;
if (disk_idx >= mddev->raid_disks
|| disk_idx < 0)
continue;
if (test_bit(Replacement, &rdev->flags))
disk = conf->mirrors + mddev->raid_disks + disk_idx;
else
disk = conf->mirrors + disk_idx;
if (disk->rdev)
if (disk_idx >= conf->raid_disks || disk_idx < 0)
continue;
if (!raid1_add_conf(conf, rdev, disk_idx,
test_bit(Replacement, &rdev->flags)))
goto abort;
disk->rdev = rdev;
disk->head_position = 0;
disk->seq_start = MaxSector;
}
conf->raid_disks = mddev->raid_disks;
conf->mddev = mddev;
INIT_LIST_HEAD(&conf->retry_list);
INIT_LIST_HEAD(&conf->bio_end_io_list);
@ -3086,12 +3194,21 @@ static struct r1conf *setup_conf(struct mddev *mddev)
return ERR_PTR(err);
}
static int raid1_set_limits(struct mddev *mddev)
{
struct queue_limits lim;
blk_set_stacking_limits(&lim);
lim.max_write_zeroes_sectors = 0;
mddev_stack_rdev_limits(mddev, &lim);
return queue_limits_set(mddev->gendisk->queue, &lim);
}
static void raid1_free(struct mddev *mddev, void *priv);
static int raid1_run(struct mddev *mddev)
{
struct r1conf *conf;
int i;
struct md_rdev *rdev;
int ret;
if (mddev->level != 1) {
@ -3118,14 +3235,10 @@ static int raid1_run(struct mddev *mddev)
if (IS_ERR(conf))
return PTR_ERR(conf);
if (mddev->queue)
blk_queue_max_write_zeroes_sectors(mddev->queue, 0);
rdev_for_each(rdev, mddev) {
if (!mddev->gendisk)
continue;
disk_stack_limits(mddev->gendisk, rdev->bdev,
rdev->data_offset << 9);
if (!mddev_is_dm(mddev)) {
ret = raid1_set_limits(mddev);
if (ret)
goto abort;
}
mddev->degraded = 0;

View File

@ -71,6 +71,7 @@ struct r1conf {
* allow for replacements.
*/
int raid_disks;
int nonrot_disks;
spinlock_t device_lock;

View File

@ -76,9 +76,6 @@ static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio);
static void end_reshape_write(struct bio *bio);
static void end_reshape(struct r10conf *conf);
#define raid10_log(md, fmt, args...) \
do { if ((md)->queue) blk_add_trace_msg((md)->queue, "raid10 " fmt, ##args); } while (0)
#include "raid1-10.c"
#define NULL_CMD
@ -518,11 +515,7 @@ static void raid10_end_write_request(struct bio *bio)
* The 'master' represents the composite IO operation to
* user-side. So if something waits for IO, then it will
* wait for the 'master' bio.
*/
sector_t first_bad;
int bad_sectors;
/*
*
* Do not set R10BIO_Uptodate if the current device is
* rebuilding or Faulty. This is because we cannot use
* such device for properly reading the data back (we could
@ -535,10 +528,9 @@ static void raid10_end_write_request(struct bio *bio)
set_bit(R10BIO_Uptodate, &r10_bio->state);
/* Maybe we can clear some bad blocks. */
if (is_badblock(rdev,
r10_bio->devs[slot].addr,
r10_bio->sectors,
&first_bad, &bad_sectors) && !discard_error) {
if (rdev_has_badblock(rdev, r10_bio->devs[slot].addr,
r10_bio->sectors) &&
!discard_error) {
bio_put(bio);
if (repl)
r10_bio->devs[slot].repl_bio = IO_MADE_GOOD;
@ -753,17 +745,8 @@ static struct md_rdev *read_balance(struct r10conf *conf,
best_good_sectors = 0;
do_balance = 1;
clear_bit(R10BIO_FailFast, &r10_bio->state);
/*
* Check if we can balance. We can balance on the whole
* device if no resync is going on (recovery is ok), or below
* the resync window. We take the first readable disk when
* above the resync window.
*/
if ((conf->mddev->recovery_cp < MaxSector
&& (this_sector + sectors >= conf->next_resync)) ||
(mddev_is_clustered(conf->mddev) &&
md_cluster_ops->area_resyncing(conf->mddev, READ, this_sector,
this_sector + sectors)))
if (raid1_should_read_first(conf->mddev, this_sector, sectors))
do_balance = 0;
for (slot = 0; slot < conf->copies ; slot++) {
@ -1033,7 +1016,7 @@ static bool wait_barrier(struct r10conf *conf, bool nowait)
ret = false;
} else {
conf->nr_waiting++;
raid10_log(conf->mddev, "wait barrier");
mddev_add_trace_msg(conf->mddev, "raid10 wait barrier");
wait_event_barrier(conf, stop_waiting_barrier(conf));
conf->nr_waiting--;
}
@ -1152,7 +1135,7 @@ static bool regular_request_wait(struct mddev *mddev, struct r10conf *conf,
bio_wouldblock_error(bio);
return false;
}
raid10_log(conf->mddev, "wait reshape");
mddev_add_trace_msg(conf->mddev, "raid10 wait reshape");
wait_event(conf->wait_barrier,
conf->reshape_progress <= bio->bi_iter.bi_sector ||
conf->reshape_progress >= bio->bi_iter.bi_sector +
@ -1249,10 +1232,7 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio,
test_bit(R10BIO_FailFast, &r10_bio->state))
read_bio->bi_opf |= MD_FAILFAST;
read_bio->bi_private = r10_bio;
if (mddev->gendisk)
trace_block_bio_remap(read_bio, disk_devt(mddev->gendisk),
r10_bio->sector);
mddev_trace_remap(mddev, read_bio, r10_bio->sector);
submit_bio_noacct(read_bio);
return;
}
@ -1288,10 +1268,7 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio,
&& enough(conf, devnum))
mbio->bi_opf |= MD_FAILFAST;
mbio->bi_private = r10_bio;
if (conf->mddev->gendisk)
trace_block_bio_remap(mbio, disk_devt(conf->mddev->gendisk),
r10_bio->sector);
mddev_trace_remap(mddev, mbio, r10_bio->sector);
/* flush_pending_writes() needs access to the rdev so...*/
mbio->bi_bdev = (void *)rdev;
@ -1330,10 +1307,7 @@ retry_wait:
}
if (rdev && test_bit(WriteErrorSeen, &rdev->flags)) {
sector_t first_bad;
sector_t dev_sector = r10_bio->devs[i].addr;
int bad_sectors;
int is_bad;
/*
* Discard request doesn't care the write result
@ -1342,9 +1316,8 @@ retry_wait:
if (!r10_bio->sectors)
continue;
is_bad = is_badblock(rdev, dev_sector, r10_bio->sectors,
&first_bad, &bad_sectors);
if (is_bad < 0) {
if (rdev_has_badblock(rdev, dev_sector,
r10_bio->sectors) < 0) {
/*
* Mustn't write here until the bad block
* is acknowledged
@ -1360,8 +1333,9 @@ retry_wait:
if (unlikely(blocked_rdev)) {
/* Have to wait for this device to get unblocked, then retry */
allow_barrier(conf);
raid10_log(conf->mddev, "%s wait rdev %d blocked",
__func__, blocked_rdev->raid_disk);
mddev_add_trace_msg(conf->mddev,
"raid10 %s wait rdev %d blocked",
__func__, blocked_rdev->raid_disk);
md_wait_for_blocked_rdev(blocked_rdev, mddev);
wait_barrier(conf, false);
goto retry_wait;
@ -1416,7 +1390,8 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
bio_wouldblock_error(bio);
return;
}
raid10_log(conf->mddev, "wait reshape metadata");
mddev_add_trace_msg(conf->mddev,
"raid10 wait reshape metadata");
wait_event(mddev->sb_wait,
!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
@ -2131,10 +2106,9 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
continue;
}
if (mddev->gendisk)
disk_stack_limits(mddev->gendisk, rdev->bdev,
rdev->data_offset << 9);
err = mddev_stack_new_rdev(mddev, rdev);
if (err)
return err;
p->head_position = 0;
p->recovery_disabled = mddev->recovery_disabled - 1;
rdev->raid_disk = mirror;
@ -2150,10 +2124,9 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
clear_bit(In_sync, &rdev->flags);
set_bit(Replacement, &rdev->flags);
rdev->raid_disk = repl_slot;
err = 0;
if (mddev->gendisk)
disk_stack_limits(mddev->gendisk, rdev->bdev,
rdev->data_offset << 9);
err = mddev_stack_new_rdev(mddev, rdev);
if (err)
return err;
conf->fullsync = 1;
WRITE_ONCE(p->replacement, rdev);
}
@ -2290,8 +2263,6 @@ static void end_sync_write(struct bio *bio)
struct mddev *mddev = r10_bio->mddev;
struct r10conf *conf = mddev->private;
int d;
sector_t first_bad;
int bad_sectors;
int slot;
int repl;
struct md_rdev *rdev = NULL;
@ -2312,11 +2283,10 @@ static void end_sync_write(struct bio *bio)
&rdev->mddev->recovery);
set_bit(R10BIO_WriteError, &r10_bio->state);
}
} else if (is_badblock(rdev,
r10_bio->devs[slot].addr,
r10_bio->sectors,
&first_bad, &bad_sectors))
} else if (rdev_has_badblock(rdev, r10_bio->devs[slot].addr,
r10_bio->sectors)) {
set_bit(R10BIO_MadeGood, &r10_bio->state);
}
rdev_dec_pending(rdev, mddev);
@ -2597,11 +2567,8 @@ static void recovery_request_write(struct mddev *mddev, struct r10bio *r10_bio)
static int r10_sync_page_io(struct md_rdev *rdev, sector_t sector,
int sectors, struct page *page, enum req_op op)
{
sector_t first_bad;
int bad_sectors;
if (is_badblock(rdev, sector, sectors, &first_bad, &bad_sectors)
&& (op == REQ_OP_READ || test_bit(WriteErrorSeen, &rdev->flags)))
if (rdev_has_badblock(rdev, sector, sectors) &&
(op == REQ_OP_READ || test_bit(WriteErrorSeen, &rdev->flags)))
return -1;
if (sync_page_io(rdev, sector, sectors << 9, page, op, false))
/* success */
@ -2658,16 +2625,14 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
s = PAGE_SIZE >> 9;
do {
sector_t first_bad;
int bad_sectors;
d = r10_bio->devs[sl].devnum;
rdev = conf->mirrors[d].rdev;
if (rdev &&
test_bit(In_sync, &rdev->flags) &&
!test_bit(Faulty, &rdev->flags) &&
is_badblock(rdev, r10_bio->devs[sl].addr + sect, s,
&first_bad, &bad_sectors) == 0) {
rdev_has_badblock(rdev,
r10_bio->devs[sl].addr + sect,
s) == 0) {
atomic_inc(&rdev->nr_pending);
success = sync_page_io(rdev,
r10_bio->devs[sl].addr +
@ -4002,14 +3967,26 @@ static struct r10conf *setup_conf(struct mddev *mddev)
return ERR_PTR(err);
}
static void raid10_set_io_opt(struct r10conf *conf)
static unsigned int raid10_nr_stripes(struct r10conf *conf)
{
int raid_disks = conf->geo.raid_disks;
unsigned int raid_disks = conf->geo.raid_disks;
if (!(conf->geo.raid_disks % conf->geo.near_copies))
raid_disks /= conf->geo.near_copies;
blk_queue_io_opt(conf->mddev->queue, (conf->mddev->chunk_sectors << 9) *
raid_disks);
if (conf->geo.raid_disks % conf->geo.near_copies)
return raid_disks;
return raid_disks / conf->geo.near_copies;
}
/*
 * Build a struct queue_limits for a raid10 array and apply it to the
 * request queue of mddev->gendisk.
 *
 * Returns whatever queue_limits_set() returns.
 */
static int raid10_set_queue_limits(struct mddev *mddev)
{
struct r10conf *conf = mddev->private;
struct queue_limits lim;
/* lim has no initialiser; this call is the first thing to write to it */
blk_set_stacking_limits(&lim);
/* write-zeroes is switched off for the array */
lim.max_write_zeroes_sectors = 0;
/* io_min is the chunk size (chunk_sectors << 9) */
lim.io_min = mddev->chunk_sectors << 9;
/* io_opt scales io_min by raid10_nr_stripes() */
lim.io_opt = lim.io_min * raid10_nr_stripes(conf);
/*
 * NOTE(review): presumably folds the member rdevs' limits into lim;
 * mddev_stack_rdev_limits() is defined elsewhere - confirm.
 */
mddev_stack_rdev_limits(mddev, &lim);
return queue_limits_set(mddev->gendisk->queue, &lim);
}
static int raid10_run(struct mddev *mddev)
@ -4021,6 +3998,7 @@ static int raid10_run(struct mddev *mddev)
sector_t size;
sector_t min_offset_diff = 0;
int first = 1;
int ret = -EIO;
if (mddev->private == NULL) {
conf = setup_conf(mddev);
@ -4047,12 +4025,6 @@ static int raid10_run(struct mddev *mddev)
}
}
if (mddev->queue) {
blk_queue_max_write_zeroes_sectors(mddev->queue, 0);
blk_queue_io_min(mddev->queue, mddev->chunk_sectors << 9);
raid10_set_io_opt(conf);
}
rdev_for_each(rdev, mddev) {
long long diff;
@ -4081,14 +4053,16 @@ static int raid10_run(struct mddev *mddev)
if (first || diff < min_offset_diff)
min_offset_diff = diff;
if (mddev->gendisk)
disk_stack_limits(mddev->gendisk, rdev->bdev,
rdev->data_offset << 9);
disk->head_position = 0;
first = 0;
}
if (!mddev_is_dm(conf->mddev)) {
ret = raid10_set_queue_limits(mddev);
if (ret)
goto out_free_conf;
}
/* need to check that every block has at least one working mirror */
if (!enough(conf, -1)) {
pr_err("md/raid10:%s: not enough operational mirrors.\n",
@ -4185,7 +4159,7 @@ out_free_conf:
raid10_free_conf(conf);
mddev->private = NULL;
out:
return -EIO;
return ret;
}
static void raid10_free(struct mddev *mddev, void *priv)
@ -4954,8 +4928,7 @@ static void end_reshape(struct r10conf *conf)
conf->reshape_safe = MaxSector;
spin_unlock_irq(&conf->device_lock);
if (conf->mddev->queue)
raid10_set_io_opt(conf);
mddev_update_io_opt(conf->mddev, raid10_nr_stripes(conf));
conf->fullsync = 0;
}

View File

@ -1393,7 +1393,8 @@ int ppl_init_log(struct r5conf *conf)
ppl_conf->signature = ~crc32c_le(~0, mddev->uuid, sizeof(mddev->uuid));
ppl_conf->block_size = 512;
} else {
ppl_conf->block_size = queue_logical_block_size(mddev->queue);
ppl_conf->block_size =
queue_logical_block_size(mddev->gendisk->queue);
}
for (i = 0; i < ppl_conf->count; i++) {

View File

@ -36,6 +36,7 @@
*/
#include <linux/blkdev.h>
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/raid/pq.h>
#include <linux/async_tx.h>
@ -760,6 +761,7 @@ enum stripe_result {
STRIPE_RETRY,
STRIPE_SCHEDULE_AND_RETRY,
STRIPE_FAIL,
STRIPE_WAIT_RESHAPE,
};
struct stripe_request_ctx {
@ -1210,10 +1212,8 @@ again:
*/
while (op_is_write(op) && rdev &&
test_bit(WriteErrorSeen, &rdev->flags)) {
sector_t first_bad;
int bad_sectors;
int bad = is_badblock(rdev, sh->sector, RAID5_STRIPE_SECTORS(conf),
&first_bad, &bad_sectors);
int bad = rdev_has_badblock(rdev, sh->sector,
RAID5_STRIPE_SECTORS(conf));
if (!bad)
break;
@ -1295,10 +1295,7 @@ again:
if (rrdev)
set_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags);
if (conf->mddev->gendisk)
trace_block_bio_remap(bi,
disk_devt(conf->mddev->gendisk),
sh->dev[i].sector);
mddev_trace_remap(conf->mddev, bi, sh->dev[i].sector);
if (should_defer && op_is_write(op))
bio_list_add(&pending_bios, bi);
else
@ -1342,10 +1339,7 @@ again:
*/
if (op == REQ_OP_DISCARD)
rbi->bi_vcnt = 0;
if (conf->mddev->gendisk)
trace_block_bio_remap(rbi,
disk_devt(conf->mddev->gendisk),
sh->dev[i].sector);
mddev_trace_remap(conf->mddev, rbi, sh->dev[i].sector);
if (should_defer && op_is_write(op))
bio_list_add(&pending_bios, rbi);
else
@ -2412,7 +2406,7 @@ static int grow_one_stripe(struct r5conf *conf, gfp_t gfp)
atomic_inc(&conf->active_stripes);
raid5_release_stripe(sh);
conf->max_nr_stripes++;
WRITE_ONCE(conf->max_nr_stripes, conf->max_nr_stripes + 1);
return 1;
}
@ -2422,12 +2416,12 @@ static int grow_stripes(struct r5conf *conf, int num)
size_t namelen = sizeof(conf->cache_name[0]);
int devs = max(conf->raid_disks, conf->previous_raid_disks);
if (conf->mddev->gendisk)
snprintf(conf->cache_name[0], namelen,
"raid%d-%s", conf->level, mdname(conf->mddev));
else
if (mddev_is_dm(conf->mddev))
snprintf(conf->cache_name[0], namelen,
"raid%d-%p", conf->level, conf->mddev);
else
snprintf(conf->cache_name[0], namelen,
"raid%d-%s", conf->level, mdname(conf->mddev));
snprintf(conf->cache_name[1], namelen, "%.27s-alt", conf->cache_name[0]);
conf->active_name = 0;
@ -2707,7 +2701,7 @@ static int drop_one_stripe(struct r5conf *conf)
shrink_buffers(sh);
free_stripe(conf->slab_cache, sh);
atomic_dec(&conf->active_stripes);
conf->max_nr_stripes--;
WRITE_ONCE(conf->max_nr_stripes, conf->max_nr_stripes - 1);
return 1;
}
@ -2855,8 +2849,6 @@ static void raid5_end_write_request(struct bio *bi)
struct r5conf *conf = sh->raid_conf;
int disks = sh->disks, i;
struct md_rdev *rdev;
sector_t first_bad;
int bad_sectors;
int replacement = 0;
for (i = 0 ; i < disks; i++) {
@ -2888,9 +2880,8 @@ static void raid5_end_write_request(struct bio *bi)
if (replacement) {
if (bi->bi_status)
md_error(conf->mddev, rdev);
else if (is_badblock(rdev, sh->sector,
RAID5_STRIPE_SECTORS(conf),
&first_bad, &bad_sectors))
else if (rdev_has_badblock(rdev, sh->sector,
RAID5_STRIPE_SECTORS(conf)))
set_bit(R5_MadeGoodRepl, &sh->dev[i].flags);
} else {
if (bi->bi_status) {
@ -2900,9 +2891,8 @@ static void raid5_end_write_request(struct bio *bi)
if (!test_and_set_bit(WantReplacement, &rdev->flags))
set_bit(MD_RECOVERY_NEEDED,
&rdev->mddev->recovery);
} else if (is_badblock(rdev, sh->sector,
RAID5_STRIPE_SECTORS(conf),
&first_bad, &bad_sectors)) {
} else if (rdev_has_badblock(rdev, sh->sector,
RAID5_STRIPE_SECTORS(conf))) {
set_bit(R5_MadeGood, &sh->dev[i].flags);
if (test_bit(R5_ReadError, &sh->dev[i].flags))
/* That was a successful write so make
@ -4205,10 +4195,9 @@ static int handle_stripe_dirtying(struct r5conf *conf,
set_bit(STRIPE_HANDLE, &sh->state);
if ((rmw < rcw || (rmw == rcw && conf->rmw_level == PARITY_PREFER_RMW)) && rmw > 0) {
/* prefer read-modify-write, but need to get some data */
if (conf->mddev->queue)
blk_add_trace_msg(conf->mddev->queue,
"raid5 rmw %llu %d",
(unsigned long long)sh->sector, rmw);
mddev_add_trace_msg(conf->mddev, "raid5 rmw %llu %d",
sh->sector, rmw);
for (i = disks; i--; ) {
struct r5dev *dev = &sh->dev[i];
if (test_bit(R5_InJournal, &dev->flags) &&
@ -4285,10 +4274,11 @@ static int handle_stripe_dirtying(struct r5conf *conf,
set_bit(STRIPE_DELAYED, &sh->state);
}
}
if (rcw && conf->mddev->queue)
blk_add_trace_msg(conf->mddev->queue, "raid5 rcw %llu %d %d %d",
(unsigned long long)sh->sector,
rcw, qread, test_bit(STRIPE_DELAYED, &sh->state));
if (rcw && !mddev_is_dm(conf->mddev))
blk_add_trace_msg(conf->mddev->gendisk->queue,
"raid5 rcw %llu %d %d %d",
(unsigned long long)sh->sector, rcw, qread,
test_bit(STRIPE_DELAYED, &sh->state));
}
if (rcw > disks && rmw > disks &&
@ -4674,8 +4664,6 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
/* Now to look around and see what can be done */
for (i=disks; i--; ) {
struct md_rdev *rdev;
sector_t first_bad;
int bad_sectors;
int is_bad = 0;
dev = &sh->dev[i];
@ -4719,8 +4707,8 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
rdev = conf->disks[i].replacement;
if (rdev && !test_bit(Faulty, &rdev->flags) &&
rdev->recovery_offset >= sh->sector + RAID5_STRIPE_SECTORS(conf) &&
!is_badblock(rdev, sh->sector, RAID5_STRIPE_SECTORS(conf),
&first_bad, &bad_sectors))
!rdev_has_badblock(rdev, sh->sector,
RAID5_STRIPE_SECTORS(conf)))
set_bit(R5_ReadRepl, &dev->flags);
else {
if (rdev && !test_bit(Faulty, &rdev->flags))
@ -4733,8 +4721,8 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
if (rdev && test_bit(Faulty, &rdev->flags))
rdev = NULL;
if (rdev) {
is_bad = is_badblock(rdev, sh->sector, RAID5_STRIPE_SECTORS(conf),
&first_bad, &bad_sectors);
is_bad = rdev_has_badblock(rdev, sh->sector,
RAID5_STRIPE_SECTORS(conf));
if (s->blocked_rdev == NULL
&& (test_bit(Blocked, &rdev->flags)
|| is_bad < 0)) {
@ -5463,8 +5451,8 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
struct r5conf *conf = mddev->private;
struct bio *align_bio;
struct md_rdev *rdev;
sector_t sector, end_sector, first_bad;
int bad_sectors, dd_idx;
sector_t sector, end_sector;
int dd_idx;
bool did_inc;
if (!in_chunk_boundary(mddev, raid_bio)) {
@ -5493,8 +5481,7 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
atomic_inc(&rdev->nr_pending);
if (is_badblock(rdev, sector, bio_sectors(raid_bio), &first_bad,
&bad_sectors)) {
if (rdev_has_badblock(rdev, sector, bio_sectors(raid_bio))) {
rdev_dec_pending(rdev, mddev);
return 0;
}
@ -5530,9 +5517,7 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
spin_unlock_irq(&conf->device_lock);
}
if (mddev->gendisk)
trace_block_bio_remap(align_bio, disk_devt(mddev->gendisk),
raid_bio->bi_iter.bi_sector);
mddev_trace_remap(mddev, align_bio, raid_bio->bi_iter.bi_sector);
submit_bio_noacct(align_bio);
return 1;
}
@ -5701,8 +5686,8 @@ static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule)
}
release_inactive_stripe_list(conf, cb->temp_inactive_list,
NR_STRIPE_HASH_LOCKS);
if (mddev->queue)
trace_block_unplug(mddev->queue, cnt, !from_schedule);
if (!mddev_is_dm(mddev))
trace_block_unplug(mddev->gendisk->queue, cnt, !from_schedule);
kfree(cb);
}
@ -5946,7 +5931,8 @@ static enum stripe_result make_stripe_request(struct mddev *mddev,
if (ahead_of_reshape(mddev, logical_sector,
conf->reshape_safe)) {
spin_unlock_irq(&conf->device_lock);
return STRIPE_SCHEDULE_AND_RETRY;
ret = STRIPE_SCHEDULE_AND_RETRY;
goto out;
}
}
spin_unlock_irq(&conf->device_lock);
@ -6025,6 +6011,12 @@ static enum stripe_result make_stripe_request(struct mddev *mddev,
out_release:
raid5_release_stripe(sh);
out:
if (ret == STRIPE_SCHEDULE_AND_RETRY && reshape_interrupted(mddev)) {
bi->bi_status = BLK_STS_RESOURCE;
ret = STRIPE_WAIT_RESHAPE;
pr_err_ratelimited("dm-raid456: io across reshape position while reshape can't make progress");
}
return ret;
}
@ -6146,7 +6138,7 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
while (1) {
res = make_stripe_request(mddev, conf, &ctx, logical_sector,
bi);
if (res == STRIPE_FAIL)
if (res == STRIPE_FAIL || res == STRIPE_WAIT_RESHAPE)
break;
if (res == STRIPE_RETRY)
@ -6184,6 +6176,11 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
if (rw == WRITE)
md_write_end(mddev);
if (res == STRIPE_WAIT_RESHAPE) {
md_free_cloned_bio(bi);
return false;
}
bio_endio(bi);
return true;
}
@ -6773,7 +6770,18 @@ static void raid5d(struct md_thread *thread)
spin_unlock_irq(&conf->device_lock);
md_check_recovery(mddev);
spin_lock_irq(&conf->device_lock);
/*
* Waiting on MD_SB_CHANGE_PENDING below may deadlock,
* since md_check_recovery() is needed to clear
* the flag when using mdmon.
*/
continue;
}
wait_event_lock_irq(mddev->sb_wait,
!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags),
conf->device_lock);
}
pr_debug("%d stripes handled\n", handled);
@ -6820,7 +6828,7 @@ raid5_set_cache_size(struct mddev *mddev, int size)
if (size <= 16 || size > 32768)
return -EINVAL;
conf->min_nr_stripes = size;
WRITE_ONCE(conf->min_nr_stripes, size);
mutex_lock(&conf->cache_size_mutex);
while (size < conf->max_nr_stripes &&
drop_one_stripe(conf))
@ -6832,7 +6840,7 @@ raid5_set_cache_size(struct mddev *mddev, int size)
mutex_lock(&conf->cache_size_mutex);
while (size > conf->max_nr_stripes)
if (!grow_one_stripe(conf, GFP_KERNEL)) {
conf->min_nr_stripes = conf->max_nr_stripes;
WRITE_ONCE(conf->min_nr_stripes, conf->max_nr_stripes);
result = -ENOMEM;
break;
}
@ -6967,10 +6975,8 @@ raid5_store_stripe_size(struct mddev *mddev, const char *page, size_t len)
pr_debug("md/raid: change stripe_size from %lu to %lu\n",
conf->stripe_size, new);
if (mddev->sync_thread ||
test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
mddev->reshape_position != MaxSector ||
mddev->sysfs_active) {
if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
mddev->reshape_position != MaxSector || mddev->sysfs_active) {
err = -EBUSY;
goto out_unlock;
}
@ -7084,7 +7090,7 @@ raid5_store_skip_copy(struct mddev *mddev, const char *page, size_t len)
if (!conf)
err = -ENODEV;
else if (new != conf->skip_copy) {
struct request_queue *q = mddev->queue;
struct request_queue *q = mddev->gendisk->queue;
conf->skip_copy = new;
if (new)
@ -7390,11 +7396,13 @@ static unsigned long raid5_cache_count(struct shrinker *shrink,
struct shrink_control *sc)
{
struct r5conf *conf = shrink->private_data;
int max_stripes = READ_ONCE(conf->max_nr_stripes);
int min_stripes = READ_ONCE(conf->min_nr_stripes);
if (conf->max_nr_stripes < conf->min_nr_stripes)
if (max_stripes < min_stripes)
/* unlikely, but not impossible */
return 0;
return conf->max_nr_stripes - conf->min_nr_stripes;
return max_stripes - min_stripes;
}
static struct r5conf *setup_conf(struct mddev *mddev)
@ -7684,10 +7692,65 @@ static int only_parity(int raid_disk, int algo, int raid_disks, int max_degraded
return 0;
}
static void raid5_set_io_opt(struct r5conf *conf)
static int raid5_set_limits(struct mddev *mddev)
{
blk_queue_io_opt(conf->mddev->queue, (conf->chunk_sectors << 9) *
(conf->raid_disks - conf->max_degraded));
struct r5conf *conf = mddev->private;
struct queue_limits lim;
int data_disks, stripe;
struct md_rdev *rdev;
/*
* The read-ahead size must cover two whole stripes, which is
* 2 * data_disks * chunksize, where data_disks is the number of raid
* devices minus max_degraded.
*/
data_disks = conf->previous_raid_disks - conf->max_degraded;
/*
* We can only discard a whole stripe. It doesn't make sense to
* discard data disk but write parity disk
*/
stripe = roundup_pow_of_two(data_disks * (mddev->chunk_sectors << 9));
blk_set_stacking_limits(&lim);
lim.io_min = mddev->chunk_sectors << 9;
lim.io_opt = lim.io_min * (conf->raid_disks - conf->max_degraded);
lim.raid_partial_stripes_expensive = 1;
lim.discard_granularity = stripe;
lim.max_write_zeroes_sectors = 0;
mddev_stack_rdev_limits(mddev, &lim);
rdev_for_each(rdev, mddev)
queue_limits_stack_bdev(&lim, rdev->bdev, rdev->new_data_offset,
mddev->gendisk->disk_name);
/*
* Zeroing is required for discard, otherwise data could be lost.
*
* Consider a scenario: discard a stripe (the stripe could be
* inconsistent if discard_zeroes_data is 0); write one disk of the
* stripe (the stripe could be inconsistent again depending on which
* disks are used to calculate parity); the disk is broken; The stripe
* data of this disk is lost.
*
* We only allow DISCARD if the sysadmin has confirmed that only safe
* devices are in use by setting a module parameter. A better idea
* might be to turn DISCARD into WRITE_ZEROES requests, as that is
* required to be safe.
*/
if (!devices_handle_discard_safely ||
lim.max_discard_sectors < (stripe >> 9) ||
lim.discard_granularity < stripe)
lim.max_hw_discard_sectors = 0;
/*
* Requests require having a bitmap for each stripe.
* Limit the max sectors based on this.
*/
lim.max_hw_sectors = RAID5_MAX_REQ_STRIPES << RAID5_STRIPE_SHIFT(conf);
/* No restrictions on the number of segments in the request */
lim.max_segments = USHRT_MAX;
return queue_limits_set(mddev->gendisk->queue, &lim);
}
static int raid5_run(struct mddev *mddev)
@ -7700,6 +7763,7 @@ static int raid5_run(struct mddev *mddev)
int i;
long long min_offset_diff = 0;
int first = 1;
int ret = -EIO;
if (mddev->recovery_cp != MaxSector)
pr_notice("md/raid:%s: not clean -- starting background reconstruction\n",
@ -7948,66 +8012,10 @@ static int raid5_run(struct mddev *mddev)
mdname(mddev));
md_set_array_sectors(mddev, raid5_size(mddev, 0, 0));
if (mddev->queue) {
int chunk_size;
/* read-ahead size must cover two whole stripes, which
* is 2 * (datadisks) * chunksize where 'n' is the
* number of raid devices
*/
int data_disks = conf->previous_raid_disks - conf->max_degraded;
int stripe = data_disks *
((mddev->chunk_sectors << 9) / PAGE_SIZE);
chunk_size = mddev->chunk_sectors << 9;
blk_queue_io_min(mddev->queue, chunk_size);
raid5_set_io_opt(conf);
mddev->queue->limits.raid_partial_stripes_expensive = 1;
/*
* We can only discard a whole stripe. It doesn't make sense to
* discard data disk but write parity disk
*/
stripe = stripe * PAGE_SIZE;
stripe = roundup_pow_of_two(stripe);
mddev->queue->limits.discard_granularity = stripe;
blk_queue_max_write_zeroes_sectors(mddev->queue, 0);
rdev_for_each(rdev, mddev) {
disk_stack_limits(mddev->gendisk, rdev->bdev,
rdev->data_offset << 9);
disk_stack_limits(mddev->gendisk, rdev->bdev,
rdev->new_data_offset << 9);
}
/*
* zeroing is required, otherwise data
* could be lost. Consider a scenario: discard a stripe
* (the stripe could be inconsistent if
* discard_zeroes_data is 0); write one disk of the
* stripe (the stripe could be inconsistent again
* depending on which disks are used to calculate
* parity); the disk is broken; The stripe data of this
* disk is lost.
*
* We only allow DISCARD if the sysadmin has confirmed that
* only safe devices are in use by setting a module parameter.
* A better idea might be to turn DISCARD into WRITE_ZEROES
* requests, as that is required to be safe.
*/
if (!devices_handle_discard_safely ||
mddev->queue->limits.max_discard_sectors < (stripe >> 9) ||
mddev->queue->limits.discard_granularity < stripe)
blk_queue_max_discard_sectors(mddev->queue, 0);
/*
* Requests require having a bitmap for each stripe.
* Limit the max sectors based on this.
*/
blk_queue_max_hw_sectors(mddev->queue,
RAID5_MAX_REQ_STRIPES << RAID5_STRIPE_SHIFT(conf));
/* No restrictions on the number of segments in the request */
blk_queue_max_segments(mddev->queue, USHRT_MAX);
if (!mddev_is_dm(mddev)) {
ret = raid5_set_limits(mddev);
if (ret)
goto abort;
}
if (log_init(conf, journal_dev, raid5_has_ppl(conf)))
@ -8020,7 +8028,7 @@ abort:
free_conf(conf);
mddev->private = NULL;
pr_warn("md/raid:%s: failed to run raid set.\n", mdname(mddev));
return -EIO;
return ret;
}
static void raid5_free(struct mddev *mddev, void *priv)
@ -8531,8 +8539,8 @@ static void end_reshape(struct r5conf *conf)
spin_unlock_irq(&conf->device_lock);
wake_up(&conf->wait_for_overlap);
if (conf->mddev->queue)
raid5_set_io_opt(conf);
mddev_update_io_opt(conf->mddev,
conf->raid_disks - conf->max_degraded);
}
}
@ -8909,6 +8917,18 @@ static int raid5_start(struct mddev *mddev)
return r5l_start(conf->log);
}
/*
 * This is only used for dm-raid456. The caller has already frozen
 * sync_thread, so if a reshape is still in progress, IO that is waiting
 * for the reshape can never complete. Wake up the waiters on
 * conf->wait_for_overlap so that this IO can be handled.
 */
static void raid5_prepare_suspend(struct mddev *mddev)
{
struct r5conf *conf = mddev->private;
wake_up(&conf->wait_for_overlap);
}
static struct md_personality raid6_personality =
{
.name = "raid6",
@ -8932,6 +8952,7 @@ static struct md_personality raid6_personality =
.quiesce = raid5_quiesce,
.takeover = raid6_takeover,
.change_consistency_policy = raid5_change_consistency_policy,
.prepare_suspend = raid5_prepare_suspend,
};
static struct md_personality raid5_personality =
{
@ -8956,6 +8977,7 @@ static struct md_personality raid5_personality =
.quiesce = raid5_quiesce,
.takeover = raid5_takeover,
.change_consistency_policy = raid5_change_consistency_policy,
.prepare_suspend = raid5_prepare_suspend,
};
static struct md_personality raid4_personality =
@ -8981,6 +9003,7 @@ static struct md_personality raid4_personality =
.quiesce = raid5_quiesce,
.takeover = raid4_takeover,
.change_consistency_policy = raid5_change_consistency_policy,
.prepare_suspend = raid5_prepare_suspend,
};
static int __init raid5_init(void)

View File

@ -2078,6 +2078,12 @@ static const struct blk_mq_ops msb_mq_ops = {
static int msb_init_disk(struct memstick_dev *card)
{
struct msb_data *msb = memstick_get_drvdata(card);
struct queue_limits lim = {
.logical_block_size = msb->page_size,
.max_hw_sectors = MS_BLOCK_MAX_PAGES,
.max_segments = MS_BLOCK_MAX_SEGS,
.max_segment_size = MS_BLOCK_MAX_PAGES * msb->page_size,
};
int rc;
unsigned long capacity;
@ -2093,19 +2099,13 @@ static int msb_init_disk(struct memstick_dev *card)
if (rc)
goto out_release_id;
msb->disk = blk_mq_alloc_disk(&msb->tag_set, card);
msb->disk = blk_mq_alloc_disk(&msb->tag_set, &lim, card);
if (IS_ERR(msb->disk)) {
rc = PTR_ERR(msb->disk);
goto out_free_tag_set;
}
msb->queue = msb->disk->queue;
blk_queue_max_hw_sectors(msb->queue, MS_BLOCK_MAX_PAGES);
blk_queue_max_segments(msb->queue, MS_BLOCK_MAX_SEGS);
blk_queue_max_segment_size(msb->queue,
MS_BLOCK_MAX_PAGES * msb->page_size);
blk_queue_logical_block_size(msb->queue, msb->page_size);
sprintf(msb->disk->disk_name, "msblk%d", msb->disk_id);
msb->disk->fops = &msb_bdops;
msb->disk->private_data = msb;

View File

@ -1103,6 +1103,12 @@ static const struct blk_mq_ops mspro_mq_ops = {
static int mspro_block_init_disk(struct memstick_dev *card)
{
struct mspro_block_data *msb = memstick_get_drvdata(card);
struct queue_limits lim = {
.logical_block_size = msb->page_size,
.max_hw_sectors = MSPRO_BLOCK_MAX_PAGES,
.max_segments = MSPRO_BLOCK_MAX_SEGS,
.max_segment_size = MSPRO_BLOCK_MAX_PAGES * msb->page_size,
};
struct mspro_devinfo *dev_info = NULL;
struct mspro_sys_info *sys_info = NULL;
struct mspro_sys_attr *s_attr = NULL;
@ -1138,18 +1144,13 @@ static int mspro_block_init_disk(struct memstick_dev *card)
if (rc)
goto out_release_id;
msb->disk = blk_mq_alloc_disk(&msb->tag_set, card);
msb->disk = blk_mq_alloc_disk(&msb->tag_set, &lim, card);
if (IS_ERR(msb->disk)) {
rc = PTR_ERR(msb->disk);
goto out_free_tag_set;
}
msb->queue = msb->disk->queue;
blk_queue_max_hw_sectors(msb->queue, MSPRO_BLOCK_MAX_PAGES);
blk_queue_max_segments(msb->queue, MSPRO_BLOCK_MAX_SEGS);
blk_queue_max_segment_size(msb->queue,
MSPRO_BLOCK_MAX_PAGES * msb->page_size);
msb->disk->major = major;
msb->disk->first_minor = disk_id << MSPRO_BLOCK_PART_SHIFT;
msb->disk->minors = 1 << MSPRO_BLOCK_PART_SHIFT;
@ -1158,8 +1159,6 @@ static int mspro_block_init_disk(struct memstick_dev *card)
sprintf(msb->disk->disk_name, "mspblk%d", disk_id);
blk_queue_logical_block_size(msb->queue, msb->page_size);
capacity = be16_to_cpu(sys_info->user_block_count);
capacity *= be16_to_cpu(sys_info->block_size);
capacity *= msb->page_size >> 9;

View File

@ -174,8 +174,8 @@ static struct scatterlist *mmc_alloc_sg(unsigned short sg_len, gfp_t gfp)
return sg;
}
static void mmc_queue_setup_discard(struct request_queue *q,
struct mmc_card *card)
static void mmc_queue_setup_discard(struct mmc_card *card,
struct queue_limits *lim)
{
unsigned max_discard;
@ -183,15 +183,17 @@ static void mmc_queue_setup_discard(struct request_queue *q,
if (!max_discard)
return;
blk_queue_max_discard_sectors(q, max_discard);
q->limits.discard_granularity = card->pref_erase << 9;
lim->max_hw_discard_sectors = max_discard;
if (mmc_can_secure_erase_trim(card))
lim->max_secure_erase_sectors = max_discard;
if (mmc_can_trim(card) && card->erased_byte == 0)
lim->max_write_zeroes_sectors = max_discard;
/* granularity must not be greater than max. discard */
if (card->pref_erase > max_discard)
q->limits.discard_granularity = SECTOR_SIZE;
if (mmc_can_secure_erase_trim(card))
blk_queue_max_secure_erase_sectors(q, max_discard);
if (mmc_can_trim(card) && card->erased_byte == 0)
blk_queue_max_write_zeroes_sectors(q, max_discard);
lim->discard_granularity = SECTOR_SIZE;
else
lim->discard_granularity = card->pref_erase << 9;
}
static unsigned short mmc_get_max_segments(struct mmc_host *host)
@ -341,40 +343,53 @@ static const struct blk_mq_ops mmc_mq_ops = {
.timeout = mmc_mq_timed_out,
};
static void mmc_setup_queue(struct mmc_queue *mq, struct mmc_card *card)
static struct gendisk *mmc_alloc_disk(struct mmc_queue *mq,
struct mmc_card *card)
{
struct mmc_host *host = card->host;
unsigned block_size = 512;
struct queue_limits lim = { };
struct gendisk *disk;
if (mmc_can_erase(card))
mmc_queue_setup_discard(card, &lim);
if (!mmc_dev(host)->dma_mask || !*mmc_dev(host)->dma_mask)
lim.bounce = BLK_BOUNCE_HIGH;
lim.max_hw_sectors = min(host->max_blk_count, host->max_req_size / 512);
if (mmc_card_mmc(card) && card->ext_csd.data_sector_size)
lim.logical_block_size = card->ext_csd.data_sector_size;
else
lim.logical_block_size = 512;
WARN_ON_ONCE(lim.logical_block_size != 512 &&
lim.logical_block_size != 4096);
/*
* Setting a virt_boundary implicitly sets a max_segment_size, so try
* to set the hardware one here.
*/
if (host->can_dma_map_merge) {
lim.virt_boundary_mask = dma_get_merge_boundary(mmc_dev(host));
lim.max_segments = MMC_DMA_MAP_MERGE_SEGMENTS;
} else {
lim.max_segment_size =
round_down(host->max_seg_size, lim.logical_block_size);
lim.max_segments = host->max_segs;
}
disk = blk_mq_alloc_disk(&mq->tag_set, &lim, mq);
if (IS_ERR(disk))
return disk;
mq->queue = disk->queue;
if (mmc_host_is_spi(host) && host->use_spi_crc)
blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, mq->queue);
blk_queue_rq_timeout(mq->queue, 60 * HZ);
blk_queue_flag_set(QUEUE_FLAG_NONROT, mq->queue);
blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, mq->queue);
if (mmc_can_erase(card))
mmc_queue_setup_discard(mq->queue, card);
if (!mmc_dev(host)->dma_mask || !*mmc_dev(host)->dma_mask)
blk_queue_bounce_limit(mq->queue, BLK_BOUNCE_HIGH);
blk_queue_max_hw_sectors(mq->queue,
min(host->max_blk_count, host->max_req_size / 512));
if (host->can_dma_map_merge)
WARN(!blk_queue_can_use_dma_map_merging(mq->queue,
mmc_dev(host)),
"merging was advertised but not possible");
blk_queue_max_segments(mq->queue, mmc_get_max_segments(host));
if (mmc_card_mmc(card) && card->ext_csd.data_sector_size) {
block_size = card->ext_csd.data_sector_size;
WARN_ON(block_size != 512 && block_size != 4096);
}
blk_queue_logical_block_size(mq->queue, block_size);
/*
* After blk_queue_can_use_dma_map_merging() was called with succeed,
* since it calls blk_queue_virt_boundary(), the mmc should not call
* both blk_queue_max_segment_size().
*/
if (!host->can_dma_map_merge)
blk_queue_max_segment_size(mq->queue,
round_down(host->max_seg_size, block_size));
dma_set_max_seg_size(mmc_dev(host), queue_max_segment_size(mq->queue));
@ -386,6 +401,7 @@ static void mmc_setup_queue(struct mmc_queue *mq, struct mmc_card *card)
init_waitqueue_head(&mq->wait);
mmc_crypto_setup_queue(mq->queue, host);
return disk;
}
static inline bool mmc_merge_capable(struct mmc_host *host)
@ -447,18 +463,9 @@ struct gendisk *mmc_init_queue(struct mmc_queue *mq, struct mmc_card *card)
return ERR_PTR(ret);
disk = blk_mq_alloc_disk(&mq->tag_set, mq);
if (IS_ERR(disk)) {
disk = mmc_alloc_disk(mq, card);
if (IS_ERR(disk))
blk_mq_free_tag_set(&mq->tag_set);
return disk;
}
mq->queue = disk->queue;
if (mmc_host_is_spi(host) && host->use_spi_crc)
blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, mq->queue);
blk_queue_rq_timeout(mq->queue, 60 * HZ);
mmc_setup_queue(mq, card);
return disk;
}

View File

@ -277,6 +277,7 @@ int add_mtd_blktrans_dev(struct mtd_blktrans_dev *new)
{
struct mtd_blktrans_ops *tr = new->tr;
struct mtd_blktrans_dev *d;
struct queue_limits lim = { };
int last_devnum = -1;
struct gendisk *gd;
int ret;
@ -331,9 +332,13 @@ int add_mtd_blktrans_dev(struct mtd_blktrans_dev *new)
BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_BLOCKING);
if (ret)
goto out_kfree_tag_set;
lim.logical_block_size = tr->blksize;
if (tr->discard)
lim.max_hw_discard_sectors = UINT_MAX;
/* Create gendisk */
gd = blk_mq_alloc_disk(new->tag_set, new);
gd = blk_mq_alloc_disk(new->tag_set, &lim, new);
if (IS_ERR(gd)) {
ret = PTR_ERR(gd);
goto out_free_tag_set;
@ -371,14 +376,9 @@ int add_mtd_blktrans_dev(struct mtd_blktrans_dev *new)
if (tr->flush)
blk_queue_write_cache(new->rq, true, false);
blk_queue_logical_block_size(new->rq, tr->blksize);
blk_queue_flag_set(QUEUE_FLAG_NONROT, new->rq);
blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, new->rq);
if (tr->discard)
blk_queue_max_discard_sectors(new->rq, UINT_MAX);
gd->queue = new->rq;
if (new->readonly)

View File

@ -348,6 +348,9 @@ static int calc_disk_capacity(struct ubi_volume_info *vi, u64 *disk_capacity)
int ubiblock_create(struct ubi_volume_info *vi)
{
struct queue_limits lim = {
.max_segments = UBI_MAX_SG_COUNT,
};
struct ubiblock *dev;
struct gendisk *gd;
u64 disk_capacity;
@ -393,7 +396,7 @@ int ubiblock_create(struct ubi_volume_info *vi)
/* Initialize the gendisk of this ubiblock device */
gd = blk_mq_alloc_disk(&dev->tag_set, dev);
gd = blk_mq_alloc_disk(&dev->tag_set, &lim, dev);
if (IS_ERR(gd)) {
ret = PTR_ERR(gd);
goto out_free_tags;
@ -416,7 +419,6 @@ int ubiblock_create(struct ubi_volume_info *vi)
dev->gd = gd;
dev->rq = gd->queue;
blk_queue_max_segments(dev->rq, UBI_MAX_SG_COUNT);
list_add_tail(&dev->list, &ubiblock_devices);

View File

@ -1496,19 +1496,21 @@ static int btt_blk_init(struct btt *btt)
{
struct nd_btt *nd_btt = btt->nd_btt;
struct nd_namespace_common *ndns = nd_btt->ndns;
int rc = -ENOMEM;
struct queue_limits lim = {
.logical_block_size = btt->sector_size,
.max_hw_sectors = UINT_MAX,
};
int rc;
btt->btt_disk = blk_alloc_disk(NUMA_NO_NODE);
if (!btt->btt_disk)
return -ENOMEM;
btt->btt_disk = blk_alloc_disk(&lim, NUMA_NO_NODE);
if (IS_ERR(btt->btt_disk))
return PTR_ERR(btt->btt_disk);
nvdimm_namespace_disk_name(ndns, btt->btt_disk->disk_name);
btt->btt_disk->first_minor = 0;
btt->btt_disk->fops = &btt_fops;
btt->btt_disk->private_data = btt;
blk_queue_logical_block_size(btt->btt_disk->queue, btt->sector_size);
blk_queue_max_hw_sectors(btt->btt_disk->queue, UINT_MAX);
blk_queue_flag_set(QUEUE_FLAG_NONROT, btt->btt_disk->queue);
blk_queue_flag_set(QUEUE_FLAG_SYNCHRONOUS, btt->btt_disk->queue);

View File

@ -451,6 +451,11 @@ static int pmem_attach_disk(struct device *dev,
{
struct nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev);
struct nd_region *nd_region = to_nd_region(dev->parent);
struct queue_limits lim = {
.logical_block_size = pmem_sector_size(ndns),
.physical_block_size = PAGE_SIZE,
.max_hw_sectors = UINT_MAX,
};
int nid = dev_to_node(dev), fua;
struct resource *res = &nsio->res;
struct range bb_range;
@ -497,9 +502,9 @@ static int pmem_attach_disk(struct device *dev,
return -EBUSY;
}
disk = blk_alloc_disk(nid);
if (!disk)
return -ENOMEM;
disk = blk_alloc_disk(&lim, nid);
if (IS_ERR(disk))
return PTR_ERR(disk);
q = disk->queue;
pmem->disk = disk;
@ -539,9 +544,6 @@ static int pmem_attach_disk(struct device *dev,
pmem->virt_addr = addr;
blk_queue_write_cache(q, true, fua);
blk_queue_physical_block_size(q, PAGE_SIZE);
blk_queue_logical_block_size(q, pmem_sector_size(ndns));
blk_queue_max_hw_sectors(q, UINT_MAX);
blk_queue_flag_set(QUEUE_FLAG_NONROT, q);
blk_queue_flag_set(QUEUE_FLAG_SYNCHRONOUS, q);
if (pmem->pfn_flags & PFN_MAP)

View File

@ -1516,7 +1516,7 @@ static int apple_nvme_probe(struct platform_device *pdev)
goto put_dev;
}
anv->ctrl.admin_q = blk_mq_init_queue(&anv->admin_tagset);
anv->ctrl.admin_q = blk_mq_alloc_queue(&anv->admin_tagset, NULL, NULL);
if (IS_ERR(anv->ctrl.admin_q)) {
ret = -ENOMEM;
goto put_dev;

View File

@ -114,12 +114,21 @@ static DEFINE_MUTEX(nvme_subsystems_lock);
static DEFINE_IDA(nvme_instance_ida);
static dev_t nvme_ctrl_base_chr_devt;
static struct class *nvme_class;
static struct class *nvme_subsys_class;
static int nvme_class_uevent(const struct device *dev, struct kobj_uevent_env *env);
static const struct class nvme_class = {
.name = "nvme",
.dev_uevent = nvme_class_uevent,
};
static const struct class nvme_subsys_class = {
.name = "nvme-subsystem",
};
static DEFINE_IDA(nvme_ns_chr_minor_ida);
static dev_t nvme_ns_chr_devt;
static struct class *nvme_ns_chr_class;
static const struct class nvme_ns_chr_class = {
.name = "nvme-generic",
};
static void nvme_put_subsystem(struct nvme_subsystem *subsys);
static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl,
@ -1398,8 +1407,10 @@ static int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id)
error = nvme_submit_sync_cmd(dev->admin_q, &c, *id,
sizeof(struct nvme_id_ctrl));
if (error)
if (error) {
kfree(*id);
*id = NULL;
}
return error;
}
@ -1528,6 +1539,7 @@ int nvme_identify_ns(struct nvme_ctrl *ctrl, unsigned nsid,
if (error) {
dev_warn(ctrl->device, "Identify namespace failed (%d)\n", error);
kfree(*id);
*id = NULL;
}
return error;
}
@ -1727,12 +1739,23 @@ int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo)
return 0;
}
#ifdef CONFIG_BLK_DEV_INTEGRITY
static void nvme_init_integrity(struct gendisk *disk,
struct nvme_ns_head *head, u32 max_integrity_segments)
static bool nvme_init_integrity(struct gendisk *disk, struct nvme_ns_head *head)
{
struct blk_integrity integrity = { };
blk_integrity_unregister(disk);
if (!head->ms)
return true;
/*
* PI can always be supported as we can ask the controller to simply
* insert/strip it, which is not possible for other kinds of metadata.
*/
if (!IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY) ||
!(head->features & NVME_NS_METADATA_SUPPORTED))
return nvme_ns_has_pi(head);
switch (head->pi_type) {
case NVME_NS_DPS_PI_TYPE3:
switch (head->guard_type) {
@ -1775,53 +1798,32 @@ static void nvme_init_integrity(struct gendisk *disk,
}
integrity.tuple_size = head->ms;
integrity.pi_offset = head->pi_offset;
blk_integrity_register(disk, &integrity);
blk_queue_max_integrity_segments(disk->queue, max_integrity_segments);
return true;
}
#else
static void nvme_init_integrity(struct gendisk *disk,
struct nvme_ns_head *head, u32 max_integrity_segments)
{
}
#endif /* CONFIG_BLK_DEV_INTEGRITY */
static void nvme_config_discard(struct nvme_ctrl *ctrl, struct gendisk *disk,
struct nvme_ns_head *head)
static void nvme_config_discard(struct nvme_ns *ns, struct queue_limits *lim)
{
struct request_queue *queue = disk->queue;
u32 max_discard_sectors;
if (ctrl->dmrsl && ctrl->dmrsl <= nvme_sect_to_lba(head, UINT_MAX)) {
max_discard_sectors = nvme_lba_to_sect(head, ctrl->dmrsl);
} else if (ctrl->oncs & NVME_CTRL_ONCS_DSM) {
max_discard_sectors = UINT_MAX;
} else {
blk_queue_max_discard_sectors(queue, 0);
return;
}
struct nvme_ctrl *ctrl = ns->ctrl;
BUILD_BUG_ON(PAGE_SIZE / sizeof(struct nvme_dsm_range) <
NVME_DSM_MAX_RANGES);
/*
* If discard is already enabled, don't reset queue limits.
*
* This works around the fact that the block layer can't cope well with
* updating the hardware limits when overridden through sysfs. This is
* harmless because discard limits in NVMe are purely advisory.
*/
if (queue->limits.max_discard_sectors)
return;
blk_queue_max_discard_sectors(queue, max_discard_sectors);
if (ctrl->dmrl)
blk_queue_max_discard_segments(queue, ctrl->dmrl);
if (ctrl->dmrsl && ctrl->dmrsl <= nvme_sect_to_lba(ns->head, UINT_MAX))
lim->max_hw_discard_sectors =
nvme_lba_to_sect(ns->head, ctrl->dmrsl);
else if (ctrl->oncs & NVME_CTRL_ONCS_DSM)
lim->max_hw_discard_sectors = UINT_MAX;
else
blk_queue_max_discard_segments(queue, NVME_DSM_MAX_RANGES);
queue->limits.discard_granularity = queue_logical_block_size(queue);
lim->max_hw_discard_sectors = 0;
if (ctrl->quirks & NVME_QUIRK_DEALLOCATE_ZEROES)
blk_queue_max_write_zeroes_sectors(queue, UINT_MAX);
lim->discard_granularity = lim->logical_block_size;
if (ctrl->dmrl)
lim->max_discard_segments = ctrl->dmrl;
else
lim->max_discard_segments = NVME_DSM_MAX_RANGES;
}
static bool nvme_ns_ids_equal(struct nvme_ns_ids *a, struct nvme_ns_ids *b)
@ -1832,42 +1834,38 @@ static bool nvme_ns_ids_equal(struct nvme_ns_ids *a, struct nvme_ns_ids *b)
a->csi == b->csi;
}
static int nvme_init_ms(struct nvme_ctrl *ctrl, struct nvme_ns_head *head,
struct nvme_id_ns *id)
static int nvme_identify_ns_nvm(struct nvme_ctrl *ctrl, unsigned int nsid,
struct nvme_id_ns_nvm **nvmp)
{
bool first = id->dps & NVME_NS_DPS_PI_FIRST;
unsigned lbaf = nvme_lbaf_index(id->flbas);
struct nvme_command c = { };
struct nvme_command c = {
.identify.opcode = nvme_admin_identify,
.identify.nsid = cpu_to_le32(nsid),
.identify.cns = NVME_ID_CNS_CS_NS,
.identify.csi = NVME_CSI_NVM,
};
struct nvme_id_ns_nvm *nvm;
int ret = 0;
u32 elbaf;
head->pi_size = 0;
head->ms = le16_to_cpu(id->lbaf[lbaf].ms);
if (!(ctrl->ctratt & NVME_CTRL_ATTR_ELBAS)) {
head->pi_size = sizeof(struct t10_pi_tuple);
head->guard_type = NVME_NVM_NS_16B_GUARD;
goto set_pi;
}
int ret;
nvm = kzalloc(sizeof(*nvm), GFP_KERNEL);
if (!nvm)
return -ENOMEM;
c.identify.opcode = nvme_admin_identify;
c.identify.nsid = cpu_to_le32(head->ns_id);
c.identify.cns = NVME_ID_CNS_CS_NS;
c.identify.csi = NVME_CSI_NVM;
ret = nvme_submit_sync_cmd(ctrl->admin_q, &c, nvm, sizeof(*nvm));
if (ret)
goto free_data;
kfree(nvm);
else
*nvmp = nvm;
return ret;
}
elbaf = le32_to_cpu(nvm->elbaf[lbaf]);
static void nvme_configure_pi_elbas(struct nvme_ns_head *head,
struct nvme_id_ns *id, struct nvme_id_ns_nvm *nvm)
{
u32 elbaf = le32_to_cpu(nvm->elbaf[nvme_lbaf_index(id->flbas)]);
/* no support for storage tag formats right now */
if (nvme_elbaf_sts(elbaf))
goto free_data;
return;
head->guard_type = nvme_elbaf_guard_type(elbaf);
switch (head->guard_type) {
@ -1880,30 +1878,31 @@ static int nvme_init_ms(struct nvme_ctrl *ctrl, struct nvme_ns_head *head,
default:
break;
}
free_data:
kfree(nvm);
set_pi:
if (head->pi_size && (first || head->ms == head->pi_size))
head->pi_type = id->dps & NVME_NS_DPS_PI_MASK;
else
head->pi_type = 0;
return ret;
}
static int nvme_configure_metadata(struct nvme_ctrl *ctrl,
struct nvme_ns_head *head, struct nvme_id_ns *id)
static void nvme_configure_metadata(struct nvme_ctrl *ctrl,
struct nvme_ns_head *head, struct nvme_id_ns *id,
struct nvme_id_ns_nvm *nvm)
{
int ret;
ret = nvme_init_ms(ctrl, head, id);
if (ret)
return ret;
head->features &= ~(NVME_NS_METADATA_SUPPORTED | NVME_NS_EXT_LBAS);
head->pi_type = 0;
head->pi_size = 0;
head->pi_offset = 0;
head->ms = le16_to_cpu(id->lbaf[nvme_lbaf_index(id->flbas)].ms);
if (!head->ms || !(ctrl->ops->flags & NVME_F_METADATA_SUPPORTED))
return 0;
return;
if (nvm && (ctrl->ctratt & NVME_CTRL_ATTR_ELBAS)) {
nvme_configure_pi_elbas(head, id, nvm);
} else {
head->pi_size = sizeof(struct t10_pi_tuple);
head->guard_type = NVME_NVM_NS_16B_GUARD;
}
if (head->pi_size && head->ms >= head->pi_size)
head->pi_type = id->dps & NVME_NS_DPS_PI_MASK;
if (!(id->dps & NVME_NS_DPS_PI_FIRST))
head->pi_offset = head->ms - head->pi_size;
if (ctrl->ops->flags & NVME_F_FABRICS) {
/*
@ -1912,7 +1911,7 @@ static int nvme_configure_metadata(struct nvme_ctrl *ctrl,
* remap the separate metadata buffer from the block layer.
*/
if (WARN_ON_ONCE(!(id->flbas & NVME_NS_FLBAS_META_EXT)))
return 0;
return;
head->features |= NVME_NS_EXT_LBAS;
@ -1939,33 +1938,32 @@ static int nvme_configure_metadata(struct nvme_ctrl *ctrl,
else
head->features |= NVME_NS_METADATA_SUPPORTED;
}
return 0;
}
static void nvme_set_queue_limits(struct nvme_ctrl *ctrl,
struct request_queue *q)
static u32 nvme_max_drv_segments(struct nvme_ctrl *ctrl)
{
bool vwc = ctrl->vwc & NVME_CTRL_VWC_PRESENT;
if (ctrl->max_hw_sectors) {
u32 max_segments =
(ctrl->max_hw_sectors / (NVME_CTRL_PAGE_SIZE >> 9)) + 1;
max_segments = min_not_zero(max_segments, ctrl->max_segments);
blk_queue_max_hw_sectors(q, ctrl->max_hw_sectors);
blk_queue_max_segments(q, min_t(u32, max_segments, USHRT_MAX));
}
blk_queue_virt_boundary(q, NVME_CTRL_PAGE_SIZE - 1);
blk_queue_dma_alignment(q, 3);
blk_queue_write_cache(q, vwc, vwc);
return ctrl->max_hw_sectors / (NVME_CTRL_PAGE_SIZE >> SECTOR_SHIFT) + 1;
}
static void nvme_update_disk_info(struct nvme_ctrl *ctrl, struct gendisk *disk,
struct nvme_ns_head *head, struct nvme_id_ns *id)
static void nvme_set_ctrl_limits(struct nvme_ctrl *ctrl,
struct queue_limits *lim)
{
sector_t capacity = nvme_lba_to_sect(head, le64_to_cpu(id->nsze));
lim->max_hw_sectors = ctrl->max_hw_sectors;
lim->max_segments = min_t(u32, USHRT_MAX,
min_not_zero(nvme_max_drv_segments(ctrl), ctrl->max_segments));
lim->max_integrity_segments = ctrl->max_integrity_segments;
lim->virt_boundary_mask = NVME_CTRL_PAGE_SIZE - 1;
lim->max_segment_size = UINT_MAX;
lim->dma_alignment = 3;
}
static bool nvme_update_disk_info(struct nvme_ns *ns, struct nvme_id_ns *id,
struct queue_limits *lim)
{
struct nvme_ns_head *head = ns->head;
u32 bs = 1U << head->lba_shift;
u32 atomic_bs, phys_bs, io_opt = 0;
bool valid = true;
/*
* The block layer can't support LBA sizes larger than the page size
@ -1973,12 +1971,10 @@ static void nvme_update_disk_info(struct nvme_ctrl *ctrl, struct gendisk *disk,
* allow block I/O.
*/
if (head->lba_shift > PAGE_SHIFT || head->lba_shift < SECTOR_SHIFT) {
capacity = 0;
bs = (1 << 9);
valid = false;
}
blk_integrity_unregister(disk);
atomic_bs = phys_bs = bs;
if (id->nabo == 0) {
/*
@ -1989,7 +1985,7 @@ static void nvme_update_disk_info(struct nvme_ctrl *ctrl, struct gendisk *disk,
if (id->nsfeat & NVME_NS_FEAT_ATOMICS && id->nawupf)
atomic_bs = (1 + le16_to_cpu(id->nawupf)) * bs;
else
atomic_bs = (1 + ctrl->subsys->awupf) * bs;
atomic_bs = (1 + ns->ctrl->subsys->awupf) * bs;
}
if (id->nsfeat & NVME_NS_FEAT_IO_OPT) {
@ -1999,36 +1995,20 @@ static void nvme_update_disk_info(struct nvme_ctrl *ctrl, struct gendisk *disk,
io_opt = bs * (1 + le16_to_cpu(id->nows));
}
blk_queue_logical_block_size(disk->queue, bs);
/*
* Linux filesystems assume writing a single physical block is
* an atomic operation. Hence limit the physical block size to the
* value of the Atomic Write Unit Power Fail parameter.
*/
blk_queue_physical_block_size(disk->queue, min(phys_bs, atomic_bs));
blk_queue_io_min(disk->queue, phys_bs);
blk_queue_io_opt(disk->queue, io_opt);
/*
* Register a metadata profile for PI, or the plain non-integrity NVMe
* metadata masquerading as Type 0 if supported, otherwise reject block
* I/O to namespaces with metadata except when the namespace supports
* PI, as it can strip/insert in that case.
*/
if (head->ms) {
if (IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY) &&
(head->features & NVME_NS_METADATA_SUPPORTED))
nvme_init_integrity(disk, head,
ctrl->max_integrity_segments);
else if (!nvme_ns_has_pi(head))
capacity = 0;
}
set_capacity_and_notify(disk, capacity);
nvme_config_discard(ctrl, disk, head);
blk_queue_max_write_zeroes_sectors(disk->queue,
ctrl->max_zeroes_sectors);
lim->logical_block_size = bs;
lim->physical_block_size = min(phys_bs, atomic_bs);
lim->io_min = phys_bs;
lim->io_opt = io_opt;
if (ns->ctrl->quirks & NVME_QUIRK_DEALLOCATE_ZEROES)
lim->max_write_zeroes_sectors = UINT_MAX;
else
lim->max_write_zeroes_sectors = ns->ctrl->max_zeroes_sectors;
return valid;
}
static bool nvme_ns_is_readonly(struct nvme_ns *ns, struct nvme_ns_info *info)
@ -2042,7 +2022,8 @@ static inline bool nvme_first_scan(struct gendisk *disk)
return !disk_live(disk);
}
static void nvme_set_chunk_sectors(struct nvme_ns *ns, struct nvme_id_ns *id)
static void nvme_set_chunk_sectors(struct nvme_ns *ns, struct nvme_id_ns *id,
struct queue_limits *lim)
{
struct nvme_ctrl *ctrl = ns->ctrl;
u32 iob;
@ -2070,38 +2051,36 @@ static void nvme_set_chunk_sectors(struct nvme_ns *ns, struct nvme_id_ns *id)
return;
}
blk_queue_chunk_sectors(ns->queue, iob);
lim->chunk_sectors = iob;
}
static int nvme_update_ns_info_generic(struct nvme_ns *ns,
struct nvme_ns_info *info)
{
struct queue_limits lim;
int ret;
blk_mq_freeze_queue(ns->disk->queue);
nvme_set_queue_limits(ns->ctrl, ns->queue);
lim = queue_limits_start_update(ns->disk->queue);
nvme_set_ctrl_limits(ns->ctrl, &lim);
ret = queue_limits_commit_update(ns->disk->queue, &lim);
set_disk_ro(ns->disk, nvme_ns_is_readonly(ns, info));
blk_mq_unfreeze_queue(ns->disk->queue);
if (nvme_ns_head_multipath(ns->head)) {
blk_mq_freeze_queue(ns->head->disk->queue);
set_disk_ro(ns->head->disk, nvme_ns_is_readonly(ns, info));
nvme_mpath_revalidate_paths(ns);
blk_stack_limits(&ns->head->disk->queue->limits,
&ns->queue->limits, 0);
ns->head->disk->flags |= GENHD_FL_HIDDEN;
blk_mq_unfreeze_queue(ns->head->disk->queue);
}
/* Hide the block-interface for these devices */
ns->disk->flags |= GENHD_FL_HIDDEN;
set_bit(NVME_NS_READY, &ns->flags);
return 0;
if (!ret)
ret = -ENODEV;
return ret;
}
static int nvme_update_ns_info_block(struct nvme_ns *ns,
struct nvme_ns_info *info)
{
bool vwc = ns->ctrl->vwc & NVME_CTRL_VWC_PRESENT;
struct queue_limits lim;
struct nvme_id_ns_nvm *nvm = NULL;
struct nvme_id_ns *id;
sector_t capacity;
unsigned lbaf;
int ret;
@ -2113,30 +2092,52 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns,
/* namespace not allocated or attached */
info->is_removed = true;
ret = -ENODEV;
goto error;
goto out;
}
if (ns->ctrl->ctratt & NVME_CTRL_ATTR_ELBAS) {
ret = nvme_identify_ns_nvm(ns->ctrl, info->nsid, &nvm);
if (ret < 0)
goto out;
}
blk_mq_freeze_queue(ns->disk->queue);
lbaf = nvme_lbaf_index(id->flbas);
ns->head->lba_shift = id->lbaf[lbaf].ds;
ns->head->nuse = le64_to_cpu(id->nuse);
nvme_set_queue_limits(ns->ctrl, ns->queue);
capacity = nvme_lba_to_sect(ns->head, le64_to_cpu(id->nsze));
ret = nvme_configure_metadata(ns->ctrl, ns->head, id);
if (ret < 0) {
blk_mq_unfreeze_queue(ns->disk->queue);
goto out;
}
nvme_set_chunk_sectors(ns, id);
nvme_update_disk_info(ns->ctrl, ns->disk, ns->head, id);
if (ns->head->ids.csi == NVME_CSI_ZNS) {
ret = nvme_update_zone_info(ns, lbaf);
lim = queue_limits_start_update(ns->disk->queue);
nvme_set_ctrl_limits(ns->ctrl, &lim);
nvme_configure_metadata(ns->ctrl, ns->head, id, nvm);
nvme_set_chunk_sectors(ns, id, &lim);
if (!nvme_update_disk_info(ns, id, &lim))
capacity = 0;
nvme_config_discard(ns, &lim);
if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
ns->head->ids.csi == NVME_CSI_ZNS) {
ret = nvme_update_zone_info(ns, lbaf, &lim);
if (ret) {
blk_mq_unfreeze_queue(ns->disk->queue);
goto out;
}
}
ret = queue_limits_commit_update(ns->disk->queue, &lim);
if (ret) {
blk_mq_unfreeze_queue(ns->disk->queue);
goto out;
}
/*
* Register a metadata profile for PI, or the plain non-integrity NVMe
* metadata masquerading as Type 0 if supported, otherwise reject block
* I/O to namespaces with metadata except when the namespace supports
* PI, as it can strip/insert in that case.
*/
if (!nvme_init_integrity(ns->disk, ns->head))
capacity = 0;
set_capacity_and_notify(ns->disk, capacity);
/*
* Only set the DEAC bit if the device guarantees that reads from
@ -2147,28 +2148,50 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns,
if ((id->dlfeat & 0x7) == 0x1 && (id->dlfeat & (1 << 3)))
ns->head->features |= NVME_NS_DEAC;
set_disk_ro(ns->disk, nvme_ns_is_readonly(ns, info));
blk_queue_write_cache(ns->disk->queue, vwc, vwc);
set_bit(NVME_NS_READY, &ns->flags);
blk_mq_unfreeze_queue(ns->disk->queue);
if (blk_queue_is_zoned(ns->queue)) {
ret = nvme_revalidate_zones(ns);
ret = blk_revalidate_disk_zones(ns->disk, NULL);
if (ret && !nvme_first_scan(ns->disk))
goto out;
}
if (nvme_ns_head_multipath(ns->head)) {
blk_mq_freeze_queue(ns->head->disk->queue);
nvme_update_disk_info(ns->ctrl, ns->head->disk, ns->head, id);
set_disk_ro(ns->head->disk, nvme_ns_is_readonly(ns, info));
nvme_mpath_revalidate_paths(ns);
blk_stack_limits(&ns->head->disk->queue->limits,
&ns->queue->limits, 0);
disk_update_readahead(ns->head->disk);
blk_mq_unfreeze_queue(ns->head->disk->queue);
}
ret = 0;
out:
kfree(nvm);
kfree(id);
return ret;
}
static int nvme_update_ns_info(struct nvme_ns *ns, struct nvme_ns_info *info)
{
bool unsupported = false;
int ret;
switch (info->ids.csi) {
case NVME_CSI_ZNS:
if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED)) {
dev_info(ns->ctrl->device,
"block device for nsid %u not supported without CONFIG_BLK_DEV_ZONED\n",
info->nsid);
ret = nvme_update_ns_info_generic(ns, info);
break;
}
ret = nvme_update_ns_info_block(ns, info);
break;
case NVME_CSI_NVM:
ret = nvme_update_ns_info_block(ns, info);
break;
default:
dev_info(ns->ctrl->device,
"block device for nsid %u not supported (csi %u)\n",
info->nsid, info->ids.csi);
ret = nvme_update_ns_info_generic(ns, info);
break;
}
/*
* If probing fails due to an unsupported feature, hide the block device,
* but still allow other access.
@ -2176,33 +2199,30 @@ out:
if (ret == -ENODEV) {
ns->disk->flags |= GENHD_FL_HIDDEN;
set_bit(NVME_NS_READY, &ns->flags);
unsupported = true;
ret = 0;
}
error:
kfree(id);
return ret;
}
if (!ret && nvme_ns_head_multipath(ns->head)) {
struct queue_limits lim;
static int nvme_update_ns_info(struct nvme_ns *ns, struct nvme_ns_info *info)
{
switch (info->ids.csi) {
case NVME_CSI_ZNS:
if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED)) {
dev_info(ns->ctrl->device,
"block device for nsid %u not supported without CONFIG_BLK_DEV_ZONED\n",
info->nsid);
return nvme_update_ns_info_generic(ns, info);
}
return nvme_update_ns_info_block(ns, info);
case NVME_CSI_NVM:
return nvme_update_ns_info_block(ns, info);
default:
dev_info(ns->ctrl->device,
"block device for nsid %u not supported (csi %u)\n",
info->nsid, info->ids.csi);
return nvme_update_ns_info_generic(ns, info);
blk_mq_freeze_queue(ns->head->disk->queue);
if (unsupported)
ns->head->disk->flags |= GENHD_FL_HIDDEN;
else
nvme_init_integrity(ns->head->disk, ns->head);
set_capacity_and_notify(ns->head->disk, get_capacity(ns->disk));
set_disk_ro(ns->head->disk, nvme_ns_is_readonly(ns, info));
nvme_mpath_revalidate_paths(ns);
lim = queue_limits_start_update(ns->head->disk->queue);
queue_limits_stack_bdev(&lim, ns->disk->part0, 0,
ns->head->disk->disk_name);
ret = queue_limits_commit_update(ns->head->disk->queue, &lim);
blk_mq_unfreeze_queue(ns->head->disk->queue);
}
return ret;
}
#ifdef CONFIG_BLK_SED_OPAL
@ -2877,7 +2897,7 @@ static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
subsys->awupf = le16_to_cpu(id->awupf);
nvme_mpath_default_iopolicy(subsys);
subsys->dev.class = nvme_subsys_class;
subsys->dev.class = &nvme_subsys_class;
subsys->dev.release = nvme_release_subsystem;
subsys->dev.groups = nvme_subsys_attrs_groups;
dev_set_name(&subsys->dev, "nvme-subsys%d", ctrl->instance);
@ -3117,11 +3137,17 @@ static int nvme_check_ctrl_fabric_info(struct nvme_ctrl *ctrl, struct nvme_id_ct
return -EINVAL;
}
if (!ctrl->maxcmd) {
dev_err(ctrl->device, "Maximum outstanding commands is 0\n");
return -EINVAL;
}
return 0;
}
static int nvme_init_identify(struct nvme_ctrl *ctrl)
{
struct queue_limits lim;
struct nvme_id_ctrl *id;
u32 max_hw_sectors;
bool prev_apst_enabled;
@ -3188,7 +3214,12 @@ static int nvme_init_identify(struct nvme_ctrl *ctrl)
ctrl->max_hw_sectors =
min_not_zero(ctrl->max_hw_sectors, max_hw_sectors);
nvme_set_queue_limits(ctrl, ctrl->admin_q);
lim = queue_limits_start_update(ctrl->admin_q);
nvme_set_ctrl_limits(ctrl, &lim);
ret = queue_limits_commit_update(ctrl->admin_q, &lim);
if (ret)
goto out_free;
ctrl->sgls = le32_to_cpu(id->sgls);
ctrl->kas = le16_to_cpu(id->kas);
ctrl->max_namespaces = le32_to_cpu(id->mnan);
@ -3420,7 +3451,7 @@ int nvme_cdev_add(struct cdev *cdev, struct device *cdev_device,
if (minor < 0)
return minor;
cdev_device->devt = MKDEV(MAJOR(nvme_ns_chr_devt), minor);
cdev_device->class = nvme_ns_chr_class;
cdev_device->class = &nvme_ns_chr_class;
cdev_device->release = nvme_cdev_rel;
device_initialize(cdev_device);
cdev_init(cdev, fops);
@ -3692,7 +3723,7 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, struct nvme_ns_info *info)
if (!ns)
return;
disk = blk_mq_alloc_disk(ctrl->tagset, ns);
disk = blk_mq_alloc_disk(ctrl->tagset, NULL, ns);
if (IS_ERR(disk))
goto out_free_ns;
disk->fops = &nvme_bdev_ops;
@ -4353,6 +4384,7 @@ EXPORT_SYMBOL_GPL(nvme_complete_async_event);
int nvme_alloc_admin_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set,
const struct blk_mq_ops *ops, unsigned int cmd_size)
{
struct queue_limits lim = {};
int ret;
memset(set, 0, sizeof(*set));
@ -4372,14 +4404,14 @@ int nvme_alloc_admin_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set,
if (ret)
return ret;
ctrl->admin_q = blk_mq_init_queue(set);
ctrl->admin_q = blk_mq_alloc_queue(set, &lim, NULL);
if (IS_ERR(ctrl->admin_q)) {
ret = PTR_ERR(ctrl->admin_q);
goto out_free_tagset;
}
if (ctrl->ops->flags & NVME_F_FABRICS) {
ctrl->fabrics_q = blk_mq_init_queue(set);
ctrl->fabrics_q = blk_mq_alloc_queue(set, NULL, NULL);
if (IS_ERR(ctrl->fabrics_q)) {
ret = PTR_ERR(ctrl->fabrics_q);
goto out_cleanup_admin_q;
@ -4443,7 +4475,7 @@ int nvme_alloc_io_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set,
return ret;
if (ctrl->ops->flags & NVME_F_FABRICS) {
ctrl->connect_q = blk_mq_init_queue(set);
ctrl->connect_q = blk_mq_alloc_queue(set, NULL, NULL);
if (IS_ERR(ctrl->connect_q)) {
ret = PTR_ERR(ctrl->connect_q);
goto out_free_tag_set;
@ -4613,7 +4645,7 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
ctrl->device = &ctrl->ctrl_device;
ctrl->device->devt = MKDEV(MAJOR(nvme_ctrl_base_chr_devt),
ctrl->instance);
ctrl->device->class = nvme_class;
ctrl->device->class = &nvme_class;
ctrl->device->parent = ctrl->dev;
if (ops->dev_attr_groups)
ctrl->device->groups = ops->dev_attr_groups;
@ -4846,42 +4878,36 @@ static int __init nvme_core_init(void)
if (result < 0)
goto destroy_delete_wq;
nvme_class = class_create("nvme");
if (IS_ERR(nvme_class)) {
result = PTR_ERR(nvme_class);
result = class_register(&nvme_class);
if (result)
goto unregister_chrdev;
}
nvme_class->dev_uevent = nvme_class_uevent;
nvme_subsys_class = class_create("nvme-subsystem");
if (IS_ERR(nvme_subsys_class)) {
result = PTR_ERR(nvme_subsys_class);
result = class_register(&nvme_subsys_class);
if (result)
goto destroy_class;
}
result = alloc_chrdev_region(&nvme_ns_chr_devt, 0, NVME_MINORS,
"nvme-generic");
if (result < 0)
goto destroy_subsys_class;
nvme_ns_chr_class = class_create("nvme-generic");
if (IS_ERR(nvme_ns_chr_class)) {
result = PTR_ERR(nvme_ns_chr_class);
result = class_register(&nvme_ns_chr_class);
if (result)
goto unregister_generic_ns;
}
result = nvme_init_auth();
if (result)
goto destroy_ns_chr;
return 0;
destroy_ns_chr:
class_destroy(nvme_ns_chr_class);
class_unregister(&nvme_ns_chr_class);
unregister_generic_ns:
unregister_chrdev_region(nvme_ns_chr_devt, NVME_MINORS);
destroy_subsys_class:
class_destroy(nvme_subsys_class);
class_unregister(&nvme_subsys_class);
destroy_class:
class_destroy(nvme_class);
class_unregister(&nvme_class);
unregister_chrdev:
unregister_chrdev_region(nvme_ctrl_base_chr_devt, NVME_MINORS);
destroy_delete_wq:
@ -4897,9 +4923,9 @@ out:
static void __exit nvme_core_exit(void)
{
nvme_exit_auth();
class_destroy(nvme_ns_chr_class);
class_destroy(nvme_subsys_class);
class_destroy(nvme_class);
class_unregister(&nvme_ns_chr_class);
class_unregister(&nvme_subsys_class);
class_unregister(&nvme_class);
unregister_chrdev_region(nvme_ns_chr_devt, NVME_MINORS);
unregister_chrdev_region(nvme_ctrl_base_chr_devt, NVME_MINORS);
destroy_workqueue(nvme_delete_wq);

View File

@ -638,7 +638,7 @@ static struct key *nvmf_parse_key(int key_id)
}
key = key_lookup(key_id);
if (!IS_ERR(key))
if (IS_ERR(key))
pr_err("key id %08x not found\n", key_id);
else
pr_debug("Using key id %08x\n", key_id);
@ -1319,7 +1319,10 @@ out_free_opts:
return ERR_PTR(ret);
}
static struct class *nvmf_class;
/*
 * Statically defined "nvme-fabrics" device class. nvmf_init() registers it
 * with class_register() and creates the "ctl" device in it; nvmf_exit()
 * destroys that device and unregisters the class.
 */
static const struct class nvmf_class = {
.name = "nvme-fabrics",
};
static struct device *nvmf_device;
static DEFINE_MUTEX(nvmf_dev_mutex);
@ -1439,15 +1442,14 @@ static int __init nvmf_init(void)
if (!nvmf_default_host)
return -ENOMEM;
nvmf_class = class_create("nvme-fabrics");
if (IS_ERR(nvmf_class)) {
ret = class_register(&nvmf_class);
if (ret) {
pr_err("couldn't register class nvme-fabrics\n");
ret = PTR_ERR(nvmf_class);
goto out_free_host;
}
nvmf_device =
device_create(nvmf_class, NULL, MKDEV(0, 0), NULL, "ctl");
device_create(&nvmf_class, NULL, MKDEV(0, 0), NULL, "ctl");
if (IS_ERR(nvmf_device)) {
pr_err("couldn't create nvme-fabrics device!\n");
ret = PTR_ERR(nvmf_device);
@ -1463,9 +1465,9 @@ static int __init nvmf_init(void)
return 0;
out_destroy_device:
device_destroy(nvmf_class, MKDEV(0, 0));
device_destroy(&nvmf_class, MKDEV(0, 0));
out_destroy_class:
class_destroy(nvmf_class);
class_unregister(&nvmf_class);
out_free_host:
nvmf_host_put(nvmf_default_host);
return ret;
@ -1474,8 +1476,8 @@ out_free_host:
static void __exit nvmf_exit(void)
{
misc_deregister(&nvmf_misc);
device_destroy(nvmf_class, MKDEV(0, 0));
class_destroy(nvmf_class);
device_destroy(&nvmf_class, MKDEV(0, 0));
class_unregister(&nvmf_class);
nvmf_host_put(nvmf_default_host);
BUILD_BUG_ON(sizeof(struct nvmf_common_command) != 64);

View File

@ -516,6 +516,7 @@ static void nvme_requeue_work(struct work_struct *work)
int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
{
struct queue_limits lim;
bool vwc = false;
mutex_init(&head->lock);
@ -532,9 +533,14 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
!nvme_is_unique_nsid(ctrl, head) || !multipath)
return 0;
head->disk = blk_alloc_disk(ctrl->numa_node);
if (!head->disk)
return -ENOMEM;
blk_set_stacking_limits(&lim);
lim.dma_alignment = 3;
if (head->ids.csi != NVME_CSI_ZNS)
lim.max_zone_append_sectors = 0;
head->disk = blk_alloc_disk(&lim, ctrl->numa_node);
if (IS_ERR(head->disk))
return PTR_ERR(head->disk);
head->disk->fops = &nvme_ns_head_ops;
head->disk->private_data = head;
sprintf(head->disk->disk_name, "nvme%dn%d",
@ -553,11 +559,6 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
ctrl->tagset->map[HCTX_TYPE_POLL].nr_queues)
blk_queue_flag_set(QUEUE_FLAG_POLL, head->disk->queue);
/* set to a default value of 512 until the disk is validated */
blk_queue_logical_block_size(head->disk->queue, 512);
blk_set_stacking_limits(&head->disk->queue->limits);
blk_queue_dma_alignment(head->disk->queue, 3);
/* we need to propagate up the VWC settings */
if (ctrl->vwc & NVME_CTRL_VWC_PRESENT)
vwc = true;

View File

@ -464,6 +464,7 @@ struct nvme_ns_head {
u16 ms;
u16 pi_size;
u8 pi_type;
u8 pi_offset;
u8 guard_type;
u16 sgs;
u32 sws;
@ -1035,11 +1036,11 @@ static inline bool nvme_disk_is_ns_head(struct gendisk *disk)
}
#endif /* CONFIG_NVME_MULTIPATH */
int nvme_revalidate_zones(struct nvme_ns *ns);
int nvme_ns_report_zones(struct nvme_ns *ns, sector_t sector,
unsigned int nr_zones, report_zones_cb cb, void *data);
int nvme_update_zone_info(struct nvme_ns *ns, unsigned lbaf,
struct queue_limits *lim);
#ifdef CONFIG_BLK_DEV_ZONED
int nvme_update_zone_info(struct nvme_ns *ns, unsigned lbaf);
blk_status_t nvme_setup_zone_mgmt_send(struct nvme_ns *ns, struct request *req,
struct nvme_command *cmnd,
enum nvme_zone_mgmt_action action);
@ -1050,13 +1051,6 @@ static inline blk_status_t nvme_setup_zone_mgmt_send(struct nvme_ns *ns,
{
return BLK_STS_NOTSUPP;
}
static inline int nvme_update_zone_info(struct nvme_ns *ns, unsigned lbaf)
{
dev_warn(ns->ctrl->device,
"Please enable CONFIG_BLK_DEV_ZONED to support ZNS devices\n");
return -EPROTONOSUPPORT;
}
#endif
static inline struct nvme_ns *nvme_get_ns_from_dev(struct device *dev)

View File

@ -1006,6 +1006,7 @@ static int nvme_rdma_setup_ctrl(struct nvme_rdma_ctrl *ctrl, bool new)
{
int ret;
bool changed;
u16 max_queue_size;
ret = nvme_rdma_configure_admin_queue(ctrl, new);
if (ret)
@ -1030,11 +1031,16 @@ static int nvme_rdma_setup_ctrl(struct nvme_rdma_ctrl *ctrl, bool new)
ctrl->ctrl.opts->queue_size, ctrl->ctrl.sqsize + 1);
}
if (ctrl->ctrl.sqsize + 1 > NVME_RDMA_MAX_QUEUE_SIZE) {
if (ctrl->ctrl.max_integrity_segments)
max_queue_size = NVME_RDMA_MAX_METADATA_QUEUE_SIZE;
else
max_queue_size = NVME_RDMA_MAX_QUEUE_SIZE;
if (ctrl->ctrl.sqsize + 1 > max_queue_size) {
dev_warn(ctrl->ctrl.device,
"ctrl sqsize %u > max queue size %u, clamping down\n",
ctrl->ctrl.sqsize + 1, NVME_RDMA_MAX_QUEUE_SIZE);
ctrl->ctrl.sqsize = NVME_RDMA_MAX_QUEUE_SIZE - 1;
"ctrl sqsize %u > max queue size %u, clamping down\n",
ctrl->ctrl.sqsize + 1, max_queue_size);
ctrl->ctrl.sqsize = max_queue_size - 1;
}
if (ctrl->ctrl.sqsize + 1 > ctrl->ctrl.maxcmd) {

View File

@ -221,14 +221,11 @@ static int ns_update_nuse(struct nvme_ns *ns)
ret = nvme_identify_ns(ns->ctrl, ns->head->ns_id, &id);
if (ret)
goto out_free_id;
return ret;
ns->head->nuse = le64_to_cpu(id->nuse);
out_free_id:
kfree(id);
return ret;
return 0;
}
static ssize_t nuse_show(struct device *dev, struct device_attribute *attr,

View File

@ -7,16 +7,6 @@
#include <linux/vmalloc.h>
#include "nvme.h"
int nvme_revalidate_zones(struct nvme_ns *ns)
{
struct request_queue *q = ns->queue;
blk_queue_chunk_sectors(q, ns->head->zsze);
blk_queue_max_zone_append_sectors(q, ns->ctrl->max_zone_append);
return blk_revalidate_disk_zones(ns->disk, NULL);
}
static int nvme_set_max_append(struct nvme_ctrl *ctrl)
{
struct nvme_command c = { };
@ -45,10 +35,10 @@ static int nvme_set_max_append(struct nvme_ctrl *ctrl)
return 0;
}
int nvme_update_zone_info(struct nvme_ns *ns, unsigned lbaf)
int nvme_update_zone_info(struct nvme_ns *ns, unsigned lbaf,
struct queue_limits *lim)
{
struct nvme_effects_log *log = ns->head->effects;
struct request_queue *q = ns->queue;
struct nvme_command c = { };
struct nvme_id_ns_zns *id;
int status;
@ -109,10 +99,12 @@ int nvme_update_zone_info(struct nvme_ns *ns, unsigned lbaf)
goto free_data;
}
disk_set_zoned(ns->disk);
blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, q);
disk_set_max_open_zones(ns->disk, le32_to_cpu(id->mor) + 1);
disk_set_max_active_zones(ns->disk, le32_to_cpu(id->mar) + 1);
blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, ns->queue);
lim->zoned = 1;
lim->max_open_zones = le32_to_cpu(id->mor) + 1;
lim->max_active_zones = le32_to_cpu(id->mar) + 1;
lim->chunk_sectors = ns->head->zsze;
lim->max_zone_append_sectors = ns->ctrl->max_zone_append;
free_data:
kfree(id);
return status;

View File

@ -428,7 +428,7 @@ static void nvmet_execute_identify_ctrl(struct nvmet_req *req)
id->cqes = (0x4 << 4) | 0x4;
/* no enforcement soft-limit for maxcmd - pick arbitrary high value */
id->maxcmd = cpu_to_le16(NVMET_MAX_CMD);
id->maxcmd = cpu_to_le16(NVMET_MAX_CMD(ctrl));
id->nn = cpu_to_le32(NVMET_MAX_NAMESPACES);
id->mnan = cpu_to_le32(NVMET_MAX_NAMESPACES);

View File

@ -273,6 +273,32 @@ static ssize_t nvmet_param_inline_data_size_store(struct config_item *item,
CONFIGFS_ATTR(nvmet_, param_inline_data_size);
static ssize_t nvmet_param_max_queue_size_show(struct config_item *item,
char *page)
{
struct nvmet_port *port = to_nvmet_port(item);
return snprintf(page, PAGE_SIZE, "%d\n", port->max_queue_size);
}
/*
 * configfs write handler for the port's param_max_queue_size attribute.
 *
 * Returns -EACCES without touching the port when nvmet_is_port_enabled()
 * returns true. Otherwise @page is parsed with kstrtoint() (base 0) directly
 * into port->max_queue_size. Returns @count on success, or -EINVAL (after
 * logging the offending input) when parsing fails.
 */
static ssize_t nvmet_param_max_queue_size_store(struct config_item *item,
		const char *page, size_t count)
{
	struct nvmet_port *port = to_nvmet_port(item);

	if (nvmet_is_port_enabled(port, __func__))
		return -EACCES;

	if (kstrtoint(page, 0, &port->max_queue_size)) {
		pr_err("Invalid value '%s' for max_queue_size\n", page);
		return -EINVAL;
	}

	return count;
}
CONFIGFS_ATTR(nvmet_, param_max_queue_size);
#ifdef CONFIG_BLK_DEV_INTEGRITY
static ssize_t nvmet_param_pi_enable_show(struct config_item *item,
char *page)
@ -1859,6 +1885,7 @@ static struct configfs_attribute *nvmet_port_attrs[] = {
&nvmet_attr_addr_trtype,
&nvmet_attr_addr_tsas,
&nvmet_attr_param_inline_data_size,
&nvmet_attr_param_max_queue_size,
#ifdef CONFIG_BLK_DEV_INTEGRITY
&nvmet_attr_param_pi_enable,
#endif
@ -1917,6 +1944,7 @@ static struct config_group *nvmet_ports_make(struct config_group *group,
INIT_LIST_HEAD(&port->subsystems);
INIT_LIST_HEAD(&port->referrals);
port->inline_data_size = -1; /* < 0 == let the transport choose */
port->max_queue_size = -1; /* < 0 == let the transport choose */
port->disc_addr.portid = cpu_to_le16(portid);
port->disc_addr.adrfam = NVMF_ADDR_FAMILY_MAX;

Some files were not shown because too many files have changed in this diff Show More