Merge tag 'for-6.1/block-2022-10-03' of git://git.kernel.dk/linux

Pull block updates from Jens Axboe:

 - NVMe pull requests via Christoph:
      - handle number of queue changes in the TCP and RDMA drivers
        (Daniel Wagner)
      - allow changing the number of queues in nvmet (Daniel Wagner)
      - also consider host_iface when checking ip options (Daniel
        Wagner)
      - don't map pages which can't come from HIGHMEM (Fabio M. De
        Francesco)
      - avoid unnecessary flush bios in nvmet (Guixin Liu)
      - shrink and better pack the nvme_iod structure (Keith Busch)
      - add comment for unaligned "fake" nqn (Linjun Bao)
      - print actual source IP address through sysfs "address" attr
        (Martin Belanger)
      - various cleanups (Jackie Liu, Wolfram Sang, Genjian Zhang)
      - handle effects after freeing the request (Keith Busch)
      - copy firmware_rev on each init (Keith Busch)
      - restrict management ioctls to admin (Keith Busch)
      - ensure subsystem reset is single threaded (Keith Busch)
      - report the actual number of tagset maps in nvme-pci (Keith
        Busch)
      - small fabrics authentication fixups (Christoph Hellwig)
      - add common code for tagset allocation and freeing (Christoph
        Hellwig)
      - stop using the request_queue in nvmet (Christoph Hellwig)
      - set min_align_mask before calculating max_hw_sectors (Rishabh
        Bhatnagar)
      - send a rediscover uevent when a persistent discovery controller
        reconnects (Sagi Grimberg)
      - misc nvmet-tcp fixes (Varun Prakash, zhenwei pi)

 - MD pull request via Song:
      - Various raid5 fixes and cleanups, by Logan Gunthorpe and David
        Sloan.
      - Raid10 performance optimization, by Yu Kuai.

 - sbitmap wakeup hang fixes (Hugh, Keith, Jan, Yu)

 - IO scheduler switching quiesce fix (Keith)

 - s390/dasd block driver updates (Stefan)

 - recovery support for the ublk driver (ZiyangZhang)

 - rnbd driver fixes and updates (Guoqing, Santosh, ye, Christoph)

 - blk-mq and null_blk map fixes (Bart)

 - various bcache fixes (Coly, Jilin, Jules)

 - nbd signal hang fix (Shigeru)

 - block writeback throttling fix (Yu)

 - optimize the passthrough mapping handling (me)

 - prepare block cgroups to be gendisk based (Christoph)

 - get rid of an old PSI hack in the block layer, moving it to the
   callers instead where it belongs (Christoph)

 - blk-throttle fixes and cleanups (Yu)

 - misc fixes and cleanups (Liu Shixin, Liu Song, Miaohe, Pankaj,
   Ping-Xiang, Wolfram, Saurabh, Li Jinlin, Li Lei, Lin, Li zeming,
   Bart, Coly, Gaosheng)

* tag 'for-6.1/block-2022-10-03' of git://git.kernel.dk/linux: (162 commits)
  sbitmap: fix lockup while swapping
  block: add rationale for not using blk_mq_plug() when applicable
  block: adapt blk_mq_plug() to not plug for writes that require a zone lock
  s390/dasd: use blk_mq_alloc_disk
  blk-cgroup: don't update the blkg lookup hint in blkg_conf_prep
  nvmet: don't look at the request_queue in nvmet_bdev_set_limits
  nvmet: don't look at the request_queue in nvmet_bdev_zone_mgmt_emulate_all
  blk-mq: use quiesced elevator switch when reinitializing queues
  block: replace blk_queue_nowait with bdev_nowait
  nvme: remove nvme_ctrl_init_connect_q
  nvme-loop: use the tagset alloc/free helpers
  nvme-loop: store the generic nvme_ctrl in set->driver_data
  nvme-loop: initialize sqsize later
  nvme-fc: use the tagset alloc/free helpers
  nvme-fc: store the generic nvme_ctrl in set->driver_data
  nvme-fc: keep ctrl->sqsize in sync with opts->queue_size
  nvme-rdma: use the tagset alloc/free helpers
  nvme-rdma: store the generic nvme_ctrl in set->driver_data
  nvme-tcp: use the tagset alloc/free helpers
  nvme-tcp: store the generic nvme_ctrl in set->driver_data
  ...
Committed by Linus Torvalds on 2022-10-07 09:19:14 -07:00
135 files changed, 3209 insertions(+), 1649 deletions(-)


@@ -14582,6 +14582,15 @@ F: drivers/nvme/common/
F: include/linux/nvme* F: include/linux/nvme*
F: include/uapi/linux/nvme_ioctl.h F: include/uapi/linux/nvme_ioctl.h
NVM EXPRESS FABRICS AUTHENTICATION
M: Hannes Reinecke <hare@suse.de>
L: linux-nvme@lists.infradead.org
S: Supported
F: drivers/nvme/host/auth.c
F: drivers/nvme/target/auth.c
F: drivers/nvme/target/fabrics-cmd-auth.c
F: include/linux/nvme-auth.h
NVM EXPRESS FC TRANSPORT DRIVERS NVM EXPRESS FC TRANSPORT DRIVERS
M: James Smart <james.smart@broadcom.com> M: James Smart <james.smart@broadcom.com>
L: linux-nvme@lists.infradead.org L: linux-nvme@lists.infradead.org


@@ -215,6 +215,11 @@ union scsw {
#define SNS2_ENV_DATA_PRESENT 0x10 #define SNS2_ENV_DATA_PRESENT 0x10
#define SNS2_INPRECISE_END 0x04 #define SNS2_INPRECISE_END 0x04
/*
* architectured values for PPRC errors
*/
#define SNS7_INVALID_ON_SEC 0x0e
/** /**
* scsw_is_tm - check for transport mode scsw * scsw_is_tm - check for transport mode scsw
* @scsw: pointer to scsw * @scsw: pointer to scsw


@@ -182,6 +182,18 @@ typedef struct format_data_t {
unsigned int intensity; unsigned int intensity;
} format_data_t; } format_data_t;
/*
* struct dasd_copypair_swap_data_t
* represents all data necessary to issue a swap of the copy pair relation
*/
struct dasd_copypair_swap_data_t {
char primary[20]; /* BUSID of primary */
char secondary[20]; /* BUSID of secondary */
/* Reserved for future updates. */
__u8 reserved[64];
};
/* /*
* values to be used for format_data_t.intensity * values to be used for format_data_t.intensity
* 0/8: normal format * 0/8: normal format
@@ -326,6 +338,8 @@ struct dasd_snid_ioctl_data {
#define BIODASDSATTR _IOW(DASD_IOCTL_LETTER,2,attrib_data_t) #define BIODASDSATTR _IOW(DASD_IOCTL_LETTER,2,attrib_data_t)
/* Release Allocated Space */ /* Release Allocated Space */
#define BIODASDRAS _IOW(DASD_IOCTL_LETTER, 3, format_data_t) #define BIODASDRAS _IOW(DASD_IOCTL_LETTER, 3, format_data_t)
/* Swap copy pair relation */
#define BIODASDCOPYPAIRSWAP _IOW(DASD_IOCTL_LETTER, 4, struct dasd_copypair_swap_data_t)
/* Get Sense Path Group ID (SNID) data */ /* Get Sense Path Group ID (SNID) data */
#define BIODASDSNID _IOWR(DASD_IOCTL_LETTER, 1, struct dasd_snid_ioctl_data) #define BIODASDSNID _IOWR(DASD_IOCTL_LETTER, 1, struct dasd_snid_ioctl_data)
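
The new BIODASDCOPYPAIRSWAP ioctl above takes the bus IDs of the primary and secondary device of a copy pair relation. A minimal user-space sketch of issuing the swap could look like the following; the device node and bus IDs are placeholders, and <asm/dasd.h> is assumed to be the installed s390 uapi header:

#include <asm/dasd.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>

int main(void)
{
        struct dasd_copypair_swap_data_t data;
        int fd = open("/dev/dasda", O_RDONLY);  /* placeholder device node */

        if (fd < 0) {
                perror("open");
                return 1;
        }
        memset(&data, 0, sizeof(data));
        strncpy(data.primary, "0.0.1234", sizeof(data.primary) - 1);    /* placeholder bus IDs */
        strncpy(data.secondary, "0.0.5678", sizeof(data.secondary) - 1);
        if (ioctl(fd, BIODASDCOPYPAIRSWAP, &data) < 0)
                perror("BIODASDCOPYPAIRSWAP");
        close(fd);
        return 0;
}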


@@ -254,17 +254,12 @@ void bfqg_stats_update_completion(struct bfq_group *bfqg, u64 start_time_ns,
#else /* CONFIG_BFQ_CGROUP_DEBUG */ #else /* CONFIG_BFQ_CGROUP_DEBUG */
void bfqg_stats_update_io_add(struct bfq_group *bfqg, struct bfq_queue *bfqq,
blk_opf_t opf) { }
void bfqg_stats_update_io_remove(struct bfq_group *bfqg, blk_opf_t opf) { } void bfqg_stats_update_io_remove(struct bfq_group *bfqg, blk_opf_t opf) { }
void bfqg_stats_update_io_merged(struct bfq_group *bfqg, blk_opf_t opf) { } void bfqg_stats_update_io_merged(struct bfq_group *bfqg, blk_opf_t opf) { }
void bfqg_stats_update_completion(struct bfq_group *bfqg, u64 start_time_ns, void bfqg_stats_update_completion(struct bfq_group *bfqg, u64 start_time_ns,
u64 io_start_time_ns, blk_opf_t opf) { } u64 io_start_time_ns, blk_opf_t opf) { }
void bfqg_stats_update_dequeue(struct bfq_group *bfqg) { } void bfqg_stats_update_dequeue(struct bfq_group *bfqg) { }
void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg) { }
void bfqg_stats_update_idle_time(struct bfq_group *bfqg) { }
void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg) { } void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg) { }
void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg) { }
#endif /* CONFIG_BFQ_CGROUP_DEBUG */ #endif /* CONFIG_BFQ_CGROUP_DEBUG */


@@ -1925,7 +1925,7 @@ static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd,
bfqq->service_from_backlogged = 0; bfqq->service_from_backlogged = 0;
bfq_clear_bfqq_softrt_update(bfqq); bfq_clear_bfqq_softrt_update(bfqq);
bfq_add_bfqq_busy(bfqd, bfqq); bfq_add_bfqq_busy(bfqq);
/* /*
* Expire in-service queue if preemption may be needed for * Expire in-service queue if preemption may be needed for
@@ -2419,7 +2419,7 @@ static void bfq_remove_request(struct request_queue *q,
bfqq->next_rq = NULL; bfqq->next_rq = NULL;
if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->in_service_queue) { if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->in_service_queue) {
bfq_del_bfqq_busy(bfqd, bfqq, false); bfq_del_bfqq_busy(bfqq, false);
/* /*
* bfqq emptied. In normal operation, when * bfqq emptied. In normal operation, when
* bfqq is empty, bfqq->entity.service and * bfqq is empty, bfqq->entity.service and
@@ -3098,7 +3098,7 @@ void bfq_release_process_ref(struct bfq_data *bfqd, struct bfq_queue *bfqq)
*/ */
if (bfq_bfqq_busy(bfqq) && RB_EMPTY_ROOT(&bfqq->sort_list) && if (bfq_bfqq_busy(bfqq) && RB_EMPTY_ROOT(&bfqq->sort_list) &&
bfqq != bfqd->in_service_queue) bfqq != bfqd->in_service_queue)
bfq_del_bfqq_busy(bfqd, bfqq, false); bfq_del_bfqq_busy(bfqq, false);
bfq_reassign_last_bfqq(bfqq, NULL); bfq_reassign_last_bfqq(bfqq, NULL);
@@ -3908,7 +3908,7 @@ static bool __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq,
*/ */
bfqq->budget_timeout = jiffies; bfqq->budget_timeout = jiffies;
bfq_del_bfqq_busy(bfqd, bfqq, true); bfq_del_bfqq_busy(bfqq, true);
} else { } else {
bfq_requeue_bfqq(bfqd, bfqq, true); bfq_requeue_bfqq(bfqd, bfqq, true);
/* /*
@@ -5255,9 +5255,7 @@ void bfq_put_queue(struct bfq_queue *bfqq)
struct hlist_node *n; struct hlist_node *n;
struct bfq_group *bfqg = bfqq_group(bfqq); struct bfq_group *bfqg = bfqq_group(bfqq);
if (bfqq->bfqd) bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p %d", bfqq, bfqq->ref);
bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p %d",
bfqq, bfqq->ref);
bfqq->ref--; bfqq->ref--;
if (bfqq->ref) if (bfqq->ref)
@@ -5321,7 +5319,7 @@ void bfq_put_queue(struct bfq_queue *bfqq)
hlist_del_init(&item->woken_list_node); hlist_del_init(&item->woken_list_node);
} }
if (bfqq->bfqd && bfqq->bfqd->last_completed_rq_bfqq == bfqq) if (bfqq->bfqd->last_completed_rq_bfqq == bfqq)
bfqq->bfqd->last_completed_rq_bfqq = NULL; bfqq->bfqd->last_completed_rq_bfqq = NULL;
kmem_cache_free(bfq_pool, bfqq); kmem_cache_free(bfq_pool, bfqq);


@@ -993,20 +993,23 @@ void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg);
/* ---------------- cgroups-support interface ---------------- */ /* ---------------- cgroups-support interface ---------------- */
void bfqg_stats_update_legacy_io(struct request_queue *q, struct request *rq); void bfqg_stats_update_legacy_io(struct request_queue *q, struct request *rq);
void bfqg_stats_update_io_add(struct bfq_group *bfqg, struct bfq_queue *bfqq,
blk_opf_t opf);
void bfqg_stats_update_io_remove(struct bfq_group *bfqg, blk_opf_t opf); void bfqg_stats_update_io_remove(struct bfq_group *bfqg, blk_opf_t opf);
void bfqg_stats_update_io_merged(struct bfq_group *bfqg, blk_opf_t opf); void bfqg_stats_update_io_merged(struct bfq_group *bfqg, blk_opf_t opf);
void bfqg_stats_update_completion(struct bfq_group *bfqg, u64 start_time_ns, void bfqg_stats_update_completion(struct bfq_group *bfqg, u64 start_time_ns,
u64 io_start_time_ns, blk_opf_t opf); u64 io_start_time_ns, blk_opf_t opf);
void bfqg_stats_update_dequeue(struct bfq_group *bfqg); void bfqg_stats_update_dequeue(struct bfq_group *bfqg);
void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg);
void bfqg_stats_update_idle_time(struct bfq_group *bfqg);
void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg); void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg);
void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg);
void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
struct bfq_group *bfqg); struct bfq_group *bfqg);
#ifdef CONFIG_BFQ_CGROUP_DEBUG
void bfqg_stats_update_io_add(struct bfq_group *bfqg, struct bfq_queue *bfqq,
blk_opf_t opf);
void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg);
void bfqg_stats_update_idle_time(struct bfq_group *bfqg);
void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg);
#endif
void bfq_init_entity(struct bfq_entity *entity, struct bfq_group *bfqg); void bfq_init_entity(struct bfq_entity *entity, struct bfq_group *bfqg);
void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio); void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio);
void bfq_end_wr_async(struct bfq_data *bfqd); void bfq_end_wr_async(struct bfq_data *bfqd);
@@ -1077,9 +1080,8 @@ void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq); void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq);
void bfq_requeue_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, void bfq_requeue_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
bool expiration); bool expiration);
void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq, void bfq_del_bfqq_busy(struct bfq_queue *bfqq, bool expiration);
bool expiration); void bfq_add_bfqq_busy(struct bfq_queue *bfqq);
void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq);
/* --------------- end of interface of B-WF2Q+ ---------------- */ /* --------------- end of interface of B-WF2Q+ ---------------- */


@@ -1651,9 +1651,10 @@ void bfq_requeue_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
* the service tree. As a special case, it can be invoked during an * the service tree. As a special case, it can be invoked during an
* expiration. * expiration.
*/ */
void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq, void bfq_del_bfqq_busy(struct bfq_queue *bfqq, bool expiration)
bool expiration)
{ {
struct bfq_data *bfqd = bfqq->bfqd;
bfq_log_bfqq(bfqd, bfqq, "del from busy"); bfq_log_bfqq(bfqd, bfqq, "del from busy");
bfq_clear_bfqq_busy(bfqq); bfq_clear_bfqq_busy(bfqq);
@@ -1674,8 +1675,10 @@ void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq,
/* /*
* Called when an inactive queue receives a new request. * Called when an inactive queue receives a new request.
*/ */
void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq) void bfq_add_bfqq_busy(struct bfq_queue *bfqq)
{ {
struct bfq_data *bfqd = bfqq->bfqd;
bfq_log_bfqq(bfqd, bfqq, "add to busy"); bfq_log_bfqq(bfqd, bfqq, "add to busy");
bfq_activate_bfqq(bfqd, bfqq); bfq_activate_bfqq(bfqd, bfqq);


@@ -760,8 +760,6 @@ EXPORT_SYMBOL(bio_put);
static int __bio_clone(struct bio *bio, struct bio *bio_src, gfp_t gfp) static int __bio_clone(struct bio *bio, struct bio *bio_src, gfp_t gfp)
{ {
bio_set_flag(bio, BIO_CLONED); bio_set_flag(bio, BIO_CLONED);
if (bio_flagged(bio_src, BIO_THROTTLED))
bio_set_flag(bio, BIO_THROTTLED);
bio->bi_ioprio = bio_src->bi_ioprio; bio->bi_ioprio = bio_src->bi_ioprio;
bio->bi_iter = bio_src->bi_iter; bio->bi_iter = bio_src->bi_iter;
@@ -1065,9 +1063,6 @@ void __bio_add_page(struct bio *bio, struct page *page,
bio->bi_iter.bi_size += len; bio->bi_iter.bi_size += len;
bio->bi_vcnt++; bio->bi_vcnt++;
if (!bio_flagged(bio, BIO_WORKINGSET) && unlikely(PageWorkingset(page)))
bio_set_flag(bio, BIO_WORKINGSET);
} }
EXPORT_SYMBOL_GPL(__bio_add_page); EXPORT_SYMBOL_GPL(__bio_add_page);
@@ -1276,9 +1271,6 @@ out:
* fit into the bio, or are requested in @iter, whatever is smaller. If * fit into the bio, or are requested in @iter, whatever is smaller. If
* MM encounters an error pinning the requested pages, it stops. Error * MM encounters an error pinning the requested pages, it stops. Error
* is returned only if 0 pages could be pinned. * is returned only if 0 pages could be pinned.
*
* It's intended for direct IO, so doesn't do PSI tracking, the caller is
* responsible for setting BIO_WORKINGSET if necessary.
*/ */
int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
{ {
@@ -1294,8 +1286,6 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
ret = __bio_iov_iter_get_pages(bio, iter); ret = __bio_iov_iter_get_pages(bio, iter);
} while (!ret && iov_iter_count(iter) && !bio_full(bio, 0)); } while (!ret && iov_iter_count(iter) && !bio_full(bio, 0));
/* don't account direct I/O as memory stall */
bio_clear_flag(bio, BIO_WORKINGSET);
return bio->bi_vcnt ? 0 : ret; return bio->bi_vcnt ? 0 : ret;
} }
EXPORT_SYMBOL_GPL(bio_iov_iter_get_pages); EXPORT_SYMBOL_GPL(bio_iov_iter_get_pages);
@@ -1754,7 +1744,8 @@ static int __init init_bio(void)
cpuhp_setup_state_multi(CPUHP_BIO_DEAD, "block/bio:dead", NULL, cpuhp_setup_state_multi(CPUHP_BIO_DEAD, "block/bio:dead", NULL,
bio_cpu_dead); bio_cpu_dead);
if (bioset_init(&fs_bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS)) if (bioset_init(&fs_bio_set, BIO_POOL_SIZE, 0,
BIOSET_NEED_BVECS | BIOSET_PERCPU_CACHE))
panic("bio: can't allocate bios\n"); panic("bio: can't allocate bios\n");
if (bioset_integrity_create(&fs_bio_set, BIO_POOL_SIZE)) if (bioset_integrity_create(&fs_bio_set, BIO_POOL_SIZE))


@@ -202,19 +202,19 @@ static inline struct blkcg *blkcg_parent(struct blkcg *blkcg)
/** /**
* blkg_alloc - allocate a blkg * blkg_alloc - allocate a blkg
* @blkcg: block cgroup the new blkg is associated with * @blkcg: block cgroup the new blkg is associated with
* @q: request_queue the new blkg is associated with * @disk: gendisk the new blkg is associated with
* @gfp_mask: allocation mask to use * @gfp_mask: allocation mask to use
* *
* Allocate a new blkg assocating @blkcg and @q. * Allocate a new blkg assocating @blkcg and @q.
*/ */
static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q, static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct gendisk *disk,
gfp_t gfp_mask) gfp_t gfp_mask)
{ {
struct blkcg_gq *blkg; struct blkcg_gq *blkg;
int i, cpu; int i, cpu;
/* alloc and init base part */ /* alloc and init base part */
blkg = kzalloc_node(sizeof(*blkg), gfp_mask, q->node); blkg = kzalloc_node(sizeof(*blkg), gfp_mask, disk->queue->node);
if (!blkg) if (!blkg)
return NULL; return NULL;
@@ -225,10 +225,10 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q,
if (!blkg->iostat_cpu) if (!blkg->iostat_cpu)
goto err_free; goto err_free;
if (!blk_get_queue(q)) if (!blk_get_queue(disk->queue))
goto err_free; goto err_free;
blkg->q = q; blkg->q = disk->queue;
INIT_LIST_HEAD(&blkg->q_node); INIT_LIST_HEAD(&blkg->q_node);
spin_lock_init(&blkg->async_bio_lock); spin_lock_init(&blkg->async_bio_lock);
bio_list_init(&blkg->async_bios); bio_list_init(&blkg->async_bios);
@@ -243,11 +243,11 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q,
struct blkcg_policy *pol = blkcg_policy[i]; struct blkcg_policy *pol = blkcg_policy[i];
struct blkg_policy_data *pd; struct blkg_policy_data *pd;
if (!blkcg_policy_enabled(q, pol)) if (!blkcg_policy_enabled(disk->queue, pol))
continue; continue;
/* alloc per-policy data and attach it to blkg */ /* alloc per-policy data and attach it to blkg */
pd = pol->pd_alloc_fn(gfp_mask, q, blkcg); pd = pol->pd_alloc_fn(gfp_mask, disk->queue, blkcg);
if (!pd) if (!pd)
goto err_free; goto err_free;
@@ -263,45 +263,20 @@ err_free:
return NULL; return NULL;
} }
struct blkcg_gq *blkg_lookup_slowpath(struct blkcg *blkcg,
struct request_queue *q, bool update_hint)
{
struct blkcg_gq *blkg;
/*
* Hint didn't match. Look up from the radix tree. Note that the
* hint can only be updated under queue_lock as otherwise @blkg
* could have already been removed from blkg_tree. The caller is
* responsible for grabbing queue_lock if @update_hint.
*/
blkg = radix_tree_lookup(&blkcg->blkg_tree, q->id);
if (blkg && blkg->q == q) {
if (update_hint) {
lockdep_assert_held(&q->queue_lock);
rcu_assign_pointer(blkcg->blkg_hint, blkg);
}
return blkg;
}
return NULL;
}
EXPORT_SYMBOL_GPL(blkg_lookup_slowpath);
/* /*
* If @new_blkg is %NULL, this function tries to allocate a new one as * If @new_blkg is %NULL, this function tries to allocate a new one as
* necessary using %GFP_NOWAIT. @new_blkg is always consumed on return. * necessary using %GFP_NOWAIT. @new_blkg is always consumed on return.
*/ */
static struct blkcg_gq *blkg_create(struct blkcg *blkcg, static struct blkcg_gq *blkg_create(struct blkcg *blkcg, struct gendisk *disk,
struct request_queue *q,
struct blkcg_gq *new_blkg) struct blkcg_gq *new_blkg)
{ {
struct blkcg_gq *blkg; struct blkcg_gq *blkg;
int i, ret; int i, ret;
lockdep_assert_held(&q->queue_lock); lockdep_assert_held(&disk->queue->queue_lock);
/* request_queue is dying, do not create/recreate a blkg */ /* request_queue is dying, do not create/recreate a blkg */
if (blk_queue_dying(q)) { if (blk_queue_dying(disk->queue)) {
ret = -ENODEV; ret = -ENODEV;
goto err_free_blkg; goto err_free_blkg;
} }
@@ -314,7 +289,7 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
/* allocate */ /* allocate */
if (!new_blkg) { if (!new_blkg) {
new_blkg = blkg_alloc(blkcg, q, GFP_NOWAIT | __GFP_NOWARN); new_blkg = blkg_alloc(blkcg, disk, GFP_NOWAIT | __GFP_NOWARN);
if (unlikely(!new_blkg)) { if (unlikely(!new_blkg)) {
ret = -ENOMEM; ret = -ENOMEM;
goto err_put_css; goto err_put_css;
@@ -324,7 +299,7 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
/* link parent */ /* link parent */
if (blkcg_parent(blkcg)) { if (blkcg_parent(blkcg)) {
blkg->parent = __blkg_lookup(blkcg_parent(blkcg), q, false); blkg->parent = blkg_lookup(blkcg_parent(blkcg), disk->queue);
if (WARN_ON_ONCE(!blkg->parent)) { if (WARN_ON_ONCE(!blkg->parent)) {
ret = -ENODEV; ret = -ENODEV;
goto err_put_css; goto err_put_css;
@@ -342,10 +317,10 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
/* insert */ /* insert */
spin_lock(&blkcg->lock); spin_lock(&blkcg->lock);
ret = radix_tree_insert(&blkcg->blkg_tree, q->id, blkg); ret = radix_tree_insert(&blkcg->blkg_tree, disk->queue->id, blkg);
if (likely(!ret)) { if (likely(!ret)) {
hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list); hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list);
list_add(&blkg->q_node, &q->blkg_list); list_add(&blkg->q_node, &disk->queue->blkg_list);
for (i = 0; i < BLKCG_MAX_POLS; i++) { for (i = 0; i < BLKCG_MAX_POLS; i++) {
struct blkcg_policy *pol = blkcg_policy[i]; struct blkcg_policy *pol = blkcg_policy[i];
@@ -374,19 +349,20 @@ err_free_blkg:
/** /**
* blkg_lookup_create - lookup blkg, try to create one if not there * blkg_lookup_create - lookup blkg, try to create one if not there
* @blkcg: blkcg of interest * @blkcg: blkcg of interest
* @q: request_queue of interest * @disk: gendisk of interest
* *
* Lookup blkg for the @blkcg - @q pair. If it doesn't exist, try to * Lookup blkg for the @blkcg - @disk pair. If it doesn't exist, try to
* create one. blkg creation is performed recursively from blkcg_root such * create one. blkg creation is performed recursively from blkcg_root such
* that all non-root blkg's have access to the parent blkg. This function * that all non-root blkg's have access to the parent blkg. This function
* should be called under RCU read lock and takes @q->queue_lock. * should be called under RCU read lock and takes @disk->queue->queue_lock.
* *
* Returns the blkg or the closest blkg if blkg_create() fails as it walks * Returns the blkg or the closest blkg if blkg_create() fails as it walks
* down from root. * down from root.
*/ */
static struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, static struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
struct request_queue *q) struct gendisk *disk)
{ {
struct request_queue *q = disk->queue;
struct blkcg_gq *blkg; struct blkcg_gq *blkg;
unsigned long flags; unsigned long flags;
@@ -397,9 +373,13 @@ static struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
return blkg; return blkg;
spin_lock_irqsave(&q->queue_lock, flags); spin_lock_irqsave(&q->queue_lock, flags);
blkg = __blkg_lookup(blkcg, q, true); blkg = blkg_lookup(blkcg, q);
if (blkg) if (blkg) {
if (blkcg != &blkcg_root &&
blkg != rcu_dereference(blkcg->blkg_hint))
rcu_assign_pointer(blkcg->blkg_hint, blkg);
goto found; goto found;
}
/* /*
* Create blkgs walking down from blkcg_root to @blkcg, so that all * Create blkgs walking down from blkcg_root to @blkcg, so that all
@@ -412,7 +392,7 @@ static struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
struct blkcg_gq *ret_blkg = q->root_blkg; struct blkcg_gq *ret_blkg = q->root_blkg;
while (parent) { while (parent) {
blkg = __blkg_lookup(parent, q, false); blkg = blkg_lookup(parent, q);
if (blkg) { if (blkg) {
/* remember closest blkg */ /* remember closest blkg */
ret_blkg = blkg; ret_blkg = blkg;
@@ -422,7 +402,7 @@ static struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
parent = blkcg_parent(parent); parent = blkcg_parent(parent);
} }
blkg = blkg_create(pos, q, NULL); blkg = blkg_create(pos, disk, NULL);
if (IS_ERR(blkg)) { if (IS_ERR(blkg)) {
blkg = ret_blkg; blkg = ret_blkg;
break; break;
@@ -476,14 +456,9 @@ static void blkg_destroy(struct blkcg_gq *blkg)
percpu_ref_kill(&blkg->refcnt); percpu_ref_kill(&blkg->refcnt);
} }
/** static void blkg_destroy_all(struct gendisk *disk)
* blkg_destroy_all - destroy all blkgs associated with a request_queue
* @q: request_queue of interest
*
* Destroy all blkgs associated with @q.
*/
static void blkg_destroy_all(struct request_queue *q)
{ {
struct request_queue *q = disk->queue;
struct blkcg_gq *blkg, *n; struct blkcg_gq *blkg, *n;
int count = BLKG_DESTROY_BATCH_SIZE; int count = BLKG_DESTROY_BATCH_SIZE;
@@ -616,19 +591,6 @@ u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v)
} }
EXPORT_SYMBOL_GPL(__blkg_prfill_u64); EXPORT_SYMBOL_GPL(__blkg_prfill_u64);
/* Performs queue bypass and policy enabled checks then looks up blkg. */
static struct blkcg_gq *blkg_lookup_check(struct blkcg *blkcg,
const struct blkcg_policy *pol,
struct request_queue *q)
{
WARN_ON_ONCE(!rcu_read_lock_held());
lockdep_assert_held(&q->queue_lock);
if (!blkcg_policy_enabled(q, pol))
return ERR_PTR(-EOPNOTSUPP);
return __blkg_lookup(blkcg, q, true /* update_hint */);
}
/** /**
* blkcg_conf_open_bdev - parse and open bdev for per-blkg config update * blkcg_conf_open_bdev - parse and open bdev for per-blkg config update
* @inputp: input string pointer * @inputp: input string pointer
@@ -684,6 +646,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
__acquires(rcu) __acquires(&bdev->bd_queue->queue_lock) __acquires(rcu) __acquires(&bdev->bd_queue->queue_lock)
{ {
struct block_device *bdev; struct block_device *bdev;
struct gendisk *disk;
struct request_queue *q; struct request_queue *q;
struct blkcg_gq *blkg; struct blkcg_gq *blkg;
int ret; int ret;
@@ -691,8 +654,8 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
bdev = blkcg_conf_open_bdev(&input); bdev = blkcg_conf_open_bdev(&input);
if (IS_ERR(bdev)) if (IS_ERR(bdev))
return PTR_ERR(bdev); return PTR_ERR(bdev);
disk = bdev->bd_disk;
q = bdev_get_queue(bdev); q = disk->queue;
/* /*
* blkcg_deactivate_policy() requires queue to be frozen, we can grab * blkcg_deactivate_policy() requires queue to be frozen, we can grab
@@ -705,12 +668,12 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
rcu_read_lock(); rcu_read_lock();
spin_lock_irq(&q->queue_lock); spin_lock_irq(&q->queue_lock);
blkg = blkg_lookup_check(blkcg, pol, q); if (!blkcg_policy_enabled(q, pol)) {
if (IS_ERR(blkg)) { ret = -EOPNOTSUPP;
ret = PTR_ERR(blkg);
goto fail_unlock; goto fail_unlock;
} }
blkg = blkg_lookup(blkcg, q);
if (blkg) if (blkg)
goto success; goto success;
@@ -724,7 +687,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
struct blkcg_gq *new_blkg; struct blkcg_gq *new_blkg;
parent = blkcg_parent(blkcg); parent = blkcg_parent(blkcg);
while (parent && !__blkg_lookup(parent, q, false)) { while (parent && !blkg_lookup(parent, q)) {
pos = parent; pos = parent;
parent = blkcg_parent(parent); parent = blkcg_parent(parent);
} }
@@ -733,7 +696,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
spin_unlock_irq(&q->queue_lock); spin_unlock_irq(&q->queue_lock);
rcu_read_unlock(); rcu_read_unlock();
new_blkg = blkg_alloc(pos, q, GFP_KERNEL); new_blkg = blkg_alloc(pos, disk, GFP_KERNEL);
if (unlikely(!new_blkg)) { if (unlikely(!new_blkg)) {
ret = -ENOMEM; ret = -ENOMEM;
goto fail_exit_queue; goto fail_exit_queue;
@@ -748,17 +711,17 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
rcu_read_lock(); rcu_read_lock();
spin_lock_irq(&q->queue_lock); spin_lock_irq(&q->queue_lock);
blkg = blkg_lookup_check(pos, pol, q); if (!blkcg_policy_enabled(q, pol)) {
if (IS_ERR(blkg)) {
ret = PTR_ERR(blkg);
blkg_free(new_blkg); blkg_free(new_blkg);
ret = -EOPNOTSUPP;
goto fail_preloaded; goto fail_preloaded;
} }
blkg = blkg_lookup(pos, q);
if (blkg) { if (blkg) {
blkg_free(new_blkg); blkg_free(new_blkg);
} else { } else {
blkg = blkg_create(pos, q, new_blkg); blkg = blkg_create(pos, disk, new_blkg);
if (IS_ERR(blkg)) { if (IS_ERR(blkg)) {
ret = PTR_ERR(blkg); ret = PTR_ERR(blkg);
goto fail_preloaded; goto fail_preloaded;
@@ -915,8 +878,7 @@ static void blkcg_fill_root_iostats(void)
class_dev_iter_init(&iter, &block_class, NULL, &disk_type); class_dev_iter_init(&iter, &block_class, NULL, &disk_type);
while ((dev = class_dev_iter_next(&iter))) { while ((dev = class_dev_iter_next(&iter))) {
struct block_device *bdev = dev_to_bdev(dev); struct block_device *bdev = dev_to_bdev(dev);
struct blkcg_gq *blkg = struct blkcg_gq *blkg = bdev->bd_disk->queue->root_blkg;
blk_queue_root_blkg(bdev_get_queue(bdev));
struct blkg_iostat tmp; struct blkg_iostat tmp;
int cpu; int cpu;
unsigned long flags; unsigned long flags;
@@ -1255,25 +1217,16 @@ static int blkcg_css_online(struct cgroup_subsys_state *css)
return 0; return 0;
} }
/** int blkcg_init_disk(struct gendisk *disk)
* blkcg_init_queue - initialize blkcg part of request queue
* @q: request_queue to initialize
*
* Called from blk_alloc_queue(). Responsible for initializing blkcg
* part of new request_queue @q.
*
* RETURNS:
* 0 on success, -errno on failure.
*/
int blkcg_init_queue(struct request_queue *q)
{ {
struct request_queue *q = disk->queue;
struct blkcg_gq *new_blkg, *blkg; struct blkcg_gq *new_blkg, *blkg;
bool preloaded; bool preloaded;
int ret; int ret;
INIT_LIST_HEAD(&q->blkg_list); INIT_LIST_HEAD(&q->blkg_list);
new_blkg = blkg_alloc(&blkcg_root, q, GFP_KERNEL); new_blkg = blkg_alloc(&blkcg_root, disk, GFP_KERNEL);
if (!new_blkg) if (!new_blkg)
return -ENOMEM; return -ENOMEM;
@@ -1282,7 +1235,7 @@ int blkcg_init_queue(struct request_queue *q)
/* Make sure the root blkg exists. */ /* Make sure the root blkg exists. */
/* spin_lock_irq can serve as RCU read-side critical section. */ /* spin_lock_irq can serve as RCU read-side critical section. */
spin_lock_irq(&q->queue_lock); spin_lock_irq(&q->queue_lock);
blkg = blkg_create(&blkcg_root, q, new_blkg); blkg = blkg_create(&blkcg_root, disk, new_blkg);
if (IS_ERR(blkg)) if (IS_ERR(blkg))
goto err_unlock; goto err_unlock;
q->root_blkg = blkg; q->root_blkg = blkg;
@@ -1291,25 +1244,26 @@ int blkcg_init_queue(struct request_queue *q)
if (preloaded) if (preloaded)
radix_tree_preload_end(); radix_tree_preload_end();
ret = blk_ioprio_init(q); ret = blk_ioprio_init(disk);
if (ret) if (ret)
goto err_destroy_all; goto err_destroy_all;
ret = blk_throtl_init(q); ret = blk_throtl_init(disk);
if (ret) if (ret)
goto err_destroy_all; goto err_ioprio_exit;
ret = blk_iolatency_init(q); ret = blk_iolatency_init(disk);
if (ret) { if (ret)
blk_throtl_exit(q); goto err_throtl_exit;
blk_ioprio_exit(q);
goto err_destroy_all;
}
return 0; return 0;
err_throtl_exit:
blk_throtl_exit(disk);
err_ioprio_exit:
blk_ioprio_exit(disk);
err_destroy_all: err_destroy_all:
blkg_destroy_all(q); blkg_destroy_all(disk);
return ret; return ret;
err_unlock: err_unlock:
spin_unlock_irq(&q->queue_lock); spin_unlock_irq(&q->queue_lock);
@@ -1318,16 +1272,10 @@ err_unlock:
return PTR_ERR(blkg); return PTR_ERR(blkg);
} }
/** void blkcg_exit_disk(struct gendisk *disk)
* blkcg_exit_queue - exit and release blkcg part of request_queue
* @q: request_queue being released
*
* Called from blk_exit_queue(). Responsible for exiting blkcg part.
*/
void blkcg_exit_queue(struct request_queue *q)
{ {
blkg_destroy_all(q); blkg_destroy_all(disk);
blk_throtl_exit(q); blk_throtl_exit(disk);
} }
static void blkcg_bind(struct cgroup_subsys_state *root_css) static void blkcg_bind(struct cgroup_subsys_state *root_css)
@@ -1836,13 +1784,13 @@ out:
/** /**
* blkcg_schedule_throttle - this task needs to check for throttling * blkcg_schedule_throttle - this task needs to check for throttling
* @q: the request queue IO was submitted on * @gendisk: disk to throttle
* @use_memdelay: do we charge this to memory delay for PSI * @use_memdelay: do we charge this to memory delay for PSI
* *
* This is called by the IO controller when we know there's delay accumulated * This is called by the IO controller when we know there's delay accumulated
* for the blkg for this task. We do not pass the blkg because there are places * for the blkg for this task. We do not pass the blkg because there are places
* we call this that may not have that information, the swapping code for * we call this that may not have that information, the swapping code for
* instance will only have a request_queue at that point. This set's the * instance will only have a block_device at that point. This set's the
* notify_resume for the task to check and see if it requires throttling before * notify_resume for the task to check and see if it requires throttling before
* returning to user space. * returning to user space.
* *
@@ -1851,8 +1799,10 @@ out:
* throttle once. If the task needs to be throttled again it'll need to be * throttle once. If the task needs to be throttled again it'll need to be
* re-set at the next time we see the task. * re-set at the next time we see the task.
*/ */
void blkcg_schedule_throttle(struct request_queue *q, bool use_memdelay) void blkcg_schedule_throttle(struct gendisk *disk, bool use_memdelay)
{ {
struct request_queue *q = disk->queue;
if (unlikely(current->flags & PF_KTHREAD)) if (unlikely(current->flags & PF_KTHREAD))
return; return;
@@ -1902,8 +1852,7 @@ static inline struct blkcg_gq *blkg_tryget_closest(struct bio *bio,
struct blkcg_gq *blkg, *ret_blkg = NULL; struct blkcg_gq *blkg, *ret_blkg = NULL;
rcu_read_lock(); rcu_read_lock();
blkg = blkg_lookup_create(css_to_blkcg(css), blkg = blkg_lookup_create(css_to_blkcg(css), bio->bi_bdev->bd_disk);
bdev_get_queue(bio->bi_bdev));
while (blkg) { while (blkg) {
if (blkg_tryget(blkg)) { if (blkg_tryget(blkg)) {
ret_blkg = blkg; ret_blkg = blkg;
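
The blk-cgroup.c changes above switch blkcg_schedule_throttle() from a request_queue to a gendisk argument. A converted rq_qos hook therefore ends up with a call shaped like the sketch below; the function is illustrative only and assumes the block-internal "blk-rq-qos.h" and "blk-cgroup.h" headers:

/* Sketch mirroring the converted call sites in blk-iocost and blk-iolatency;
 * rqos->q->disk replaces the old request_queue argument. */
static void example_rqos_throttle(struct rq_qos *rqos, struct bio *bio)
{
        bool is_swap = (bio->bi_opf & REQ_SWAP) == REQ_SWAP;

        /* Flag the current task so it checks for throttling before returning
         * to user space; is_swap charges the delay to PSI memory stall. */
        blkcg_schedule_throttle(rqos->q->disk, is_swap);
}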


@@ -178,10 +178,8 @@ struct blkcg_policy {
extern struct blkcg blkcg_root; extern struct blkcg blkcg_root;
extern bool blkcg_debug_stats; extern bool blkcg_debug_stats;
struct blkcg_gq *blkg_lookup_slowpath(struct blkcg *blkcg, int blkcg_init_disk(struct gendisk *disk);
struct request_queue *q, bool update_hint); void blkcg_exit_disk(struct gendisk *disk);
int blkcg_init_queue(struct request_queue *q);
void blkcg_exit_queue(struct request_queue *q);
/* Blkio controller policy registration */ /* Blkio controller policy registration */
int blkcg_policy_register(struct blkcg_policy *pol); int blkcg_policy_register(struct blkcg_policy *pol);
@@ -227,22 +225,21 @@ static inline bool bio_issue_as_root_blkg(struct bio *bio)
} }
/** /**
* __blkg_lookup - internal version of blkg_lookup() * blkg_lookup - lookup blkg for the specified blkcg - q pair
* @blkcg: blkcg of interest * @blkcg: blkcg of interest
* @q: request_queue of interest * @q: request_queue of interest
* @update_hint: whether to update lookup hint with the result or not
* *
* This is internal version and shouldn't be used by policy * Lookup blkg for the @blkcg - @q pair.
* implementations. Looks up blkgs for the @blkcg - @q pair regardless of
* @q's bypass state. If @update_hint is %true, the caller should be * Must be called in a RCU critical section.
* holding @q->queue_lock and lookup hint is updated on success.
*/ */
static inline struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg,
struct request_queue *q, struct request_queue *q)
bool update_hint)
{ {
struct blkcg_gq *blkg; struct blkcg_gq *blkg;
WARN_ON_ONCE(!rcu_read_lock_held());
if (blkcg == &blkcg_root) if (blkcg == &blkcg_root)
return q->root_blkg; return q->root_blkg;
@@ -250,33 +247,10 @@ static inline struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg,
if (blkg && blkg->q == q) if (blkg && blkg->q == q)
return blkg; return blkg;
return blkg_lookup_slowpath(blkcg, q, update_hint); blkg = radix_tree_lookup(&blkcg->blkg_tree, q->id);
} if (blkg && blkg->q != q)
blkg = NULL;
/** return blkg;
* blkg_lookup - lookup blkg for the specified blkcg - q pair
* @blkcg: blkcg of interest
* @q: request_queue of interest
*
* Lookup blkg for the @blkcg - @q pair. This function should be called
* under RCU read lock.
*/
static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg,
struct request_queue *q)
{
WARN_ON_ONCE(!rcu_read_lock_held());
return __blkg_lookup(blkcg, q, false);
}
/**
* blk_queue_root_blkg - return blkg for the (blkcg_root, @q) pair
* @q: request_queue of interest
*
* Lookup blkg for @q at the root level. See also blkg_lookup().
*/
static inline struct blkcg_gq *blk_queue_root_blkg(struct request_queue *q)
{
return q->root_blkg;
} }
/** /**
@@ -373,8 +347,8 @@ static inline void blkg_put(struct blkcg_gq *blkg)
*/ */
#define blkg_for_each_descendant_pre(d_blkg, pos_css, p_blkg) \ #define blkg_for_each_descendant_pre(d_blkg, pos_css, p_blkg) \
css_for_each_descendant_pre((pos_css), &(p_blkg)->blkcg->css) \ css_for_each_descendant_pre((pos_css), &(p_blkg)->blkcg->css) \
if (((d_blkg) = __blkg_lookup(css_to_blkcg(pos_css), \ if (((d_blkg) = blkg_lookup(css_to_blkcg(pos_css), \
(p_blkg)->q, false))) (p_blkg)->q)))
/** /**
* blkg_for_each_descendant_post - post-order walk of a blkg's descendants * blkg_for_each_descendant_post - post-order walk of a blkg's descendants
@@ -388,8 +362,8 @@ static inline void blkg_put(struct blkcg_gq *blkg)
*/ */
#define blkg_for_each_descendant_post(d_blkg, pos_css, p_blkg) \ #define blkg_for_each_descendant_post(d_blkg, pos_css, p_blkg) \
css_for_each_descendant_post((pos_css), &(p_blkg)->blkcg->css) \ css_for_each_descendant_post((pos_css), &(p_blkg)->blkcg->css) \
if (((d_blkg) = __blkg_lookup(css_to_blkcg(pos_css), \ if (((d_blkg) = blkg_lookup(css_to_blkcg(pos_css), \
(p_blkg)->q, false))) (p_blkg)->q)))
bool __blkcg_punt_bio_submit(struct bio *bio); bool __blkcg_punt_bio_submit(struct bio *bio);
@@ -507,10 +481,8 @@ struct blkcg {
}; };
static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, void *key) { return NULL; } static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, void *key) { return NULL; }
static inline struct blkcg_gq *blk_queue_root_blkg(struct request_queue *q) static inline int blkcg_init_disk(struct gendisk *disk) { return 0; }
{ return NULL; } static inline void blkcg_exit_disk(struct gendisk *disk) { }
static inline int blkcg_init_queue(struct request_queue *q) { return 0; }
static inline void blkcg_exit_queue(struct request_queue *q) { }
static inline int blkcg_policy_register(struct blkcg_policy *pol) { return 0; } static inline int blkcg_policy_register(struct blkcg_policy *pol) { return 0; }
static inline void blkcg_policy_unregister(struct blkcg_policy *pol) { } static inline void blkcg_policy_unregister(struct blkcg_policy *pol) { }
static inline int blkcg_activate_policy(struct request_queue *q, static inline int blkcg_activate_policy(struct request_queue *q,
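
With blkg_lookup_slowpath() and the update_hint parameter gone, blkg_lookup() above is a plain RCU-protected lookup that warns when called outside an RCU read-side critical section. A minimal sketch of the calling convention, assuming block-internal code that includes "blk-cgroup.h" (the body is a placeholder):

static void example_inspect_blkg(struct blkcg *blkcg, struct request_queue *q)
{
        struct blkcg_gq *blkg;

        rcu_read_lock();
        blkg = blkg_lookup(blkcg, q);
        if (blkg) {
                /* read per-blkg state here; no sleeping under rcu_read_lock() */
        }
        rcu_read_unlock();
}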


@@ -37,7 +37,6 @@
#include <linux/t10-pi.h> #include <linux/t10-pi.h>
#include <linux/debugfs.h> #include <linux/debugfs.h>
#include <linux/bpf.h> #include <linux/bpf.h>
#include <linux/psi.h>
#include <linux/part_stat.h> #include <linux/part_stat.h>
#include <linux/sched/sysctl.h> #include <linux/sched/sysctl.h>
#include <linux/blk-crypto.h> #include <linux/blk-crypto.h>
@@ -487,18 +486,15 @@ static int __init fail_make_request_debugfs(void)
late_initcall(fail_make_request_debugfs); late_initcall(fail_make_request_debugfs);
#endif /* CONFIG_FAIL_MAKE_REQUEST */ #endif /* CONFIG_FAIL_MAKE_REQUEST */
static inline bool bio_check_ro(struct bio *bio) static inline void bio_check_ro(struct bio *bio)
{ {
if (op_is_write(bio_op(bio)) && bdev_read_only(bio->bi_bdev)) { if (op_is_write(bio_op(bio)) && bdev_read_only(bio->bi_bdev)) {
if (op_is_flush(bio->bi_opf) && !bio_sectors(bio)) if (op_is_flush(bio->bi_opf) && !bio_sectors(bio))
return false; return;
pr_warn("Trying to write to read-only block-device %pg\n", pr_warn("Trying to write to read-only block-device %pg\n",
bio->bi_bdev); bio->bi_bdev);
/* Older lvm-tools actually trigger this */ /* Older lvm-tools actually trigger this */
return false;
} }
return false;
} }
static noinline int should_fail_bio(struct bio *bio) static noinline int should_fail_bio(struct bio *bio)
@@ -717,13 +713,12 @@ void submit_bio_noacct(struct bio *bio)
* For a REQ_NOWAIT based request, return -EOPNOTSUPP * For a REQ_NOWAIT based request, return -EOPNOTSUPP
* if queue does not support NOWAIT. * if queue does not support NOWAIT.
*/ */
if ((bio->bi_opf & REQ_NOWAIT) && !blk_queue_nowait(q)) if ((bio->bi_opf & REQ_NOWAIT) && !bdev_nowait(bdev))
goto not_supported; goto not_supported;
if (should_fail_bio(bio)) if (should_fail_bio(bio))
goto end_io; goto end_io;
if (unlikely(bio_check_ro(bio))) bio_check_ro(bio);
goto end_io;
if (!bio_flagged(bio, BIO_REMAPPED)) { if (!bio_flagged(bio, BIO_REMAPPED)) {
if (unlikely(bio_check_eod(bio))) if (unlikely(bio_check_eod(bio)))
goto end_io; goto end_io;
@@ -814,7 +809,7 @@ EXPORT_SYMBOL(submit_bio_noacct);
* *
* The success/failure status of the request, along with notification of * The success/failure status of the request, along with notification of
* completion, is delivered asynchronously through the ->bi_end_io() callback * completion, is delivered asynchronously through the ->bi_end_io() callback
* in @bio. The bio must NOT be touched by thecaller until ->bi_end_io() has * in @bio. The bio must NOT be touched by the caller until ->bi_end_io() has
* been called. * been called.
*/ */
void submit_bio(struct bio *bio) void submit_bio(struct bio *bio)
@@ -829,22 +824,6 @@ void submit_bio(struct bio *bio)
count_vm_events(PGPGOUT, bio_sectors(bio)); count_vm_events(PGPGOUT, bio_sectors(bio));
} }
/*
* If we're reading data that is part of the userspace workingset, count
* submission time as memory stall. When the device is congested, or
* the submitting cgroup IO-throttled, submission can be a significant
* part of overall IO time.
*/
if (unlikely(bio_op(bio) == REQ_OP_READ &&
bio_flagged(bio, BIO_WORKINGSET))) {
unsigned long pflags;
psi_memstall_enter(&pflags);
submit_bio_noacct(bio);
psi_memstall_leave(&pflags);
return;
}
submit_bio_noacct(bio); submit_bio_noacct(bio);
} }
EXPORT_SYMBOL(submit_bio); EXPORT_SYMBOL(submit_bio);
@@ -871,6 +850,12 @@ int bio_poll(struct bio *bio, struct io_comp_batch *iob, unsigned int flags)
!test_bit(QUEUE_FLAG_POLL, &q->queue_flags)) !test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
return 0; return 0;
/*
* As the requests that require a zone lock are not plugged in the
* first place, directly accessing the plug instead of using
* blk_mq_plug() should not have any consequences during flushing for
* zoned devices.
*/
blk_flush_plug(current->plug, false); blk_flush_plug(current->plug, false);
if (bio_queue_enter(bio)) if (bio_queue_enter(bio))
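
The blk-core.c hunks above drop the BIO_WORKINGSET/PSI special case from submit_bio(); per the "old PSI hack" item in the pull request message, the accounting now lives in the callers. A caller-side sketch of that pattern (the wrapper function is illustrative, not taken from the tree):

#include <linux/bio.h>
#include <linux/psi.h>

/* Illustrative: a caller reading userspace workingset pages now accounts
 * submission time as a memory stall itself. */
static void example_submit_workingset_read(struct bio *bio, bool workingset)
{
        unsigned long pflags;

        if (workingset)
                psi_memstall_enter(&pflags);
        submit_bio(bio);
        if (workingset)
                psi_memstall_leave(&pflags);
}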


@@ -664,17 +664,13 @@ static struct ioc *q_to_ioc(struct request_queue *q)
return rqos_to_ioc(rq_qos_id(q, RQ_QOS_COST)); return rqos_to_ioc(rq_qos_id(q, RQ_QOS_COST));
} }
static const char *q_name(struct request_queue *q)
{
if (blk_queue_registered(q))
return kobject_name(q->kobj.parent);
else
return "<unknown>";
}
static const char __maybe_unused *ioc_name(struct ioc *ioc) static const char __maybe_unused *ioc_name(struct ioc *ioc)
{ {
return q_name(ioc->rqos.q); struct gendisk *disk = ioc->rqos.q->disk;
if (!disk)
return "<unknown>";
return disk->disk_name;
} }
static struct ioc_gq *pd_to_iocg(struct blkg_policy_data *pd) static struct ioc_gq *pd_to_iocg(struct blkg_policy_data *pd)
@@ -1430,7 +1426,7 @@ static int iocg_wake_fn(struct wait_queue_entry *wq_entry, unsigned mode,
int flags, void *key) int flags, void *key)
{ {
struct iocg_wait *wait = container_of(wq_entry, struct iocg_wait, wait); struct iocg_wait *wait = container_of(wq_entry, struct iocg_wait, wait);
struct iocg_wake_ctx *ctx = (struct iocg_wake_ctx *)key; struct iocg_wake_ctx *ctx = key;
u64 cost = abs_cost_to_cost(wait->abs_cost, ctx->hw_inuse); u64 cost = abs_cost_to_cost(wait->abs_cost, ctx->hw_inuse);
ctx->vbudget -= cost; ctx->vbudget -= cost;
@@ -2640,7 +2636,7 @@ retry_lock:
if (use_debt) { if (use_debt) {
iocg_incur_debt(iocg, abs_cost, &now); iocg_incur_debt(iocg, abs_cost, &now);
if (iocg_kick_delay(iocg, &now)) if (iocg_kick_delay(iocg, &now))
blkcg_schedule_throttle(rqos->q, blkcg_schedule_throttle(rqos->q->disk,
(bio->bi_opf & REQ_SWAP) == REQ_SWAP); (bio->bi_opf & REQ_SWAP) == REQ_SWAP);
iocg_unlock(iocg, ioc_locked, &flags); iocg_unlock(iocg, ioc_locked, &flags);
return; return;
@@ -2741,7 +2737,7 @@ static void ioc_rqos_merge(struct rq_qos *rqos, struct request *rq,
if (likely(!list_empty(&iocg->active_list))) { if (likely(!list_empty(&iocg->active_list))) {
iocg_incur_debt(iocg, abs_cost, &now); iocg_incur_debt(iocg, abs_cost, &now);
if (iocg_kick_delay(iocg, &now)) if (iocg_kick_delay(iocg, &now))
blkcg_schedule_throttle(rqos->q, blkcg_schedule_throttle(rqos->q->disk,
(bio->bi_opf & REQ_SWAP) == REQ_SWAP); (bio->bi_opf & REQ_SWAP) == REQ_SWAP);
} else { } else {
iocg_commit_bio(iocg, bio, abs_cost, cost); iocg_commit_bio(iocg, bio, abs_cost, cost);
@@ -2832,8 +2828,9 @@ static struct rq_qos_ops ioc_rqos_ops = {
.exit = ioc_rqos_exit, .exit = ioc_rqos_exit,
}; };
static int blk_iocost_init(struct request_queue *q) static int blk_iocost_init(struct gendisk *disk)
{ {
struct request_queue *q = disk->queue;
struct ioc *ioc; struct ioc *ioc;
struct rq_qos *rqos; struct rq_qos *rqos;
int i, cpu, ret; int i, cpu, ret;
@@ -3170,6 +3167,7 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input,
size_t nbytes, loff_t off) size_t nbytes, loff_t off)
{ {
struct block_device *bdev; struct block_device *bdev;
struct gendisk *disk;
struct ioc *ioc; struct ioc *ioc;
u32 qos[NR_QOS_PARAMS]; u32 qos[NR_QOS_PARAMS];
bool enable, user; bool enable, user;
@@ -3180,12 +3178,13 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input,
if (IS_ERR(bdev)) if (IS_ERR(bdev))
return PTR_ERR(bdev); return PTR_ERR(bdev);
ioc = q_to_ioc(bdev_get_queue(bdev)); disk = bdev->bd_disk;
ioc = q_to_ioc(disk->queue);
if (!ioc) { if (!ioc) {
ret = blk_iocost_init(bdev_get_queue(bdev)); ret = blk_iocost_init(disk);
if (ret) if (ret)
goto err; goto err;
ioc = q_to_ioc(bdev_get_queue(bdev)); ioc = q_to_ioc(disk->queue);
} }
spin_lock_irq(&ioc->lock); spin_lock_irq(&ioc->lock);
@@ -3262,11 +3261,11 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input,
spin_lock_irq(&ioc->lock); spin_lock_irq(&ioc->lock);
if (enable) { if (enable) {
blk_stat_enable_accounting(ioc->rqos.q); blk_stat_enable_accounting(disk->queue);
blk_queue_flag_set(QUEUE_FLAG_RQ_ALLOC_TIME, ioc->rqos.q); blk_queue_flag_set(QUEUE_FLAG_RQ_ALLOC_TIME, disk->queue);
ioc->enabled = true; ioc->enabled = true;
} else { } else {
blk_queue_flag_clear(QUEUE_FLAG_RQ_ALLOC_TIME, ioc->rqos.q); blk_queue_flag_clear(QUEUE_FLAG_RQ_ALLOC_TIME, disk->queue);
ioc->enabled = false; ioc->enabled = false;
} }
@@ -3349,7 +3348,7 @@ static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input,
ioc = q_to_ioc(bdev_get_queue(bdev)); ioc = q_to_ioc(bdev_get_queue(bdev));
if (!ioc) { if (!ioc) {
ret = blk_iocost_init(bdev_get_queue(bdev)); ret = blk_iocost_init(bdev->bd_disk);
if (ret) if (ret)
goto err; goto err;
ioc = q_to_ioc(bdev_get_queue(bdev)); ioc = q_to_ioc(bdev_get_queue(bdev));


@@ -292,7 +292,7 @@ static void __blkcg_iolatency_throttle(struct rq_qos *rqos,
unsigned use_delay = atomic_read(&lat_to_blkg(iolat)->use_delay); unsigned use_delay = atomic_read(&lat_to_blkg(iolat)->use_delay);
if (use_delay) if (use_delay)
blkcg_schedule_throttle(rqos->q, use_memdelay); blkcg_schedule_throttle(rqos->q->disk, use_memdelay);
/* /*
* To avoid priority inversions we want to just take a slot if we are * To avoid priority inversions we want to just take a slot if we are
@@ -756,8 +756,9 @@ static void blkiolatency_enable_work_fn(struct work_struct *work)
} }
} }
int blk_iolatency_init(struct request_queue *q) int blk_iolatency_init(struct gendisk *disk)
{ {
struct request_queue *q = disk->queue;
struct blk_iolatency *blkiolat; struct blk_iolatency *blkiolat;
struct rq_qos *rqos; struct rq_qos *rqos;
int ret; int ret;


@@ -202,14 +202,14 @@ void blkcg_set_ioprio(struct bio *bio)
bio->bi_ioprio = prio; bio->bi_ioprio = prio;
} }
void blk_ioprio_exit(struct request_queue *q) void blk_ioprio_exit(struct gendisk *disk)
{ {
blkcg_deactivate_policy(q, &ioprio_policy); blkcg_deactivate_policy(disk->queue, &ioprio_policy);
} }
int blk_ioprio_init(struct request_queue *q) int blk_ioprio_init(struct gendisk *disk)
{ {
return blkcg_activate_policy(q, &ioprio_policy); return blkcg_activate_policy(disk->queue, &ioprio_policy);
} }
static int __init ioprio_init(void) static int __init ioprio_init(void)


@@ -9,15 +9,15 @@ struct request_queue;
struct bio; struct bio;
#ifdef CONFIG_BLK_CGROUP_IOPRIO #ifdef CONFIG_BLK_CGROUP_IOPRIO
int blk_ioprio_init(struct request_queue *q); int blk_ioprio_init(struct gendisk *disk);
void blk_ioprio_exit(struct request_queue *q); void blk_ioprio_exit(struct gendisk *disk);
void blkcg_set_ioprio(struct bio *bio); void blkcg_set_ioprio(struct bio *bio);
#else #else
static inline int blk_ioprio_init(struct request_queue *q) static inline int blk_ioprio_init(struct gendisk *disk)
{ {
return 0; return 0;
} }
static inline void blk_ioprio_exit(struct request_queue *q) static inline void blk_ioprio_exit(struct gendisk *disk)
{ {
} }
static inline void blkcg_set_ioprio(struct bio *bio) static inline void blkcg_set_ioprio(struct bio *bio)


@@ -158,7 +158,7 @@ static int bio_copy_user_iov(struct request *rq, struct rq_map_data *map_data,
bio_init(bio, NULL, bio->bi_inline_vecs, nr_pages, req_op(rq)); bio_init(bio, NULL, bio->bi_inline_vecs, nr_pages, req_op(rq));
if (map_data) { if (map_data) {
nr_pages = 1 << map_data->page_order; nr_pages = 1U << map_data->page_order;
i = map_data->offset / PAGE_SIZE; i = map_data->offset / PAGE_SIZE;
} }
while (len) { while (len) {
@@ -231,6 +231,16 @@ out_bmd:
return ret; return ret;
} }
static void bio_map_put(struct bio *bio)
{
if (bio->bi_opf & REQ_ALLOC_CACHE) {
bio_put(bio);
} else {
bio_uninit(bio);
kfree(bio);
}
}
static int bio_map_user_iov(struct request *rq, struct iov_iter *iter, static int bio_map_user_iov(struct request *rq, struct iov_iter *iter,
gfp_t gfp_mask) gfp_t gfp_mask)
{ {
@@ -243,18 +253,34 @@ static int bio_map_user_iov(struct request *rq, struct iov_iter *iter,
if (!iov_iter_count(iter)) if (!iov_iter_count(iter))
return -EINVAL; return -EINVAL;
bio = bio_kmalloc(nr_vecs, gfp_mask); if (rq->cmd_flags & REQ_POLLED) {
if (!bio) blk_opf_t opf = rq->cmd_flags | REQ_ALLOC_CACHE;
return -ENOMEM;
bio_init(bio, NULL, bio->bi_inline_vecs, nr_vecs, req_op(rq)); bio = bio_alloc_bioset(NULL, nr_vecs, opf, gfp_mask,
&fs_bio_set);
if (!bio)
return -ENOMEM;
} else {
bio = bio_kmalloc(nr_vecs, gfp_mask);
if (!bio)
return -ENOMEM;
bio_init(bio, NULL, bio->bi_inline_vecs, nr_vecs, req_op(rq));
}
while (iov_iter_count(iter)) { while (iov_iter_count(iter)) {
struct page **pages; struct page **pages, *stack_pages[UIO_FASTIOV];
ssize_t bytes; ssize_t bytes;
size_t offs, added = 0; size_t offs;
int npages; int npages;
bytes = iov_iter_get_pages_alloc2(iter, &pages, LONG_MAX, &offs); if (nr_vecs <= ARRAY_SIZE(stack_pages)) {
pages = stack_pages;
bytes = iov_iter_get_pages2(iter, pages, LONG_MAX,
nr_vecs, &offs);
} else {
bytes = iov_iter_get_pages_alloc2(iter, &pages,
LONG_MAX, &offs);
}
if (unlikely(bytes <= 0)) { if (unlikely(bytes <= 0)) {
ret = bytes ? bytes : -EFAULT; ret = bytes ? bytes : -EFAULT;
goto out_unmap; goto out_unmap;
@@ -280,7 +306,6 @@ static int bio_map_user_iov(struct request *rq, struct iov_iter *iter,
break; break;
} }
added += n;
bytes -= n; bytes -= n;
offs = 0; offs = 0;
} }
@@ -290,7 +315,8 @@ static int bio_map_user_iov(struct request *rq, struct iov_iter *iter,
*/ */
while (j < npages) while (j < npages)
put_page(pages[j++]); put_page(pages[j++]);
kvfree(pages); if (pages != stack_pages)
kvfree(pages);
/* couldn't stuff something into bio? */ /* couldn't stuff something into bio? */
if (bytes) { if (bytes) {
iov_iter_revert(iter, bytes); iov_iter_revert(iter, bytes);
@@ -305,8 +331,7 @@ static int bio_map_user_iov(struct request *rq, struct iov_iter *iter,
out_unmap: out_unmap:
bio_release_pages(bio, false); bio_release_pages(bio, false);
bio_uninit(bio); bio_map_put(bio);
kfree(bio);
return ret; return ret;
} }
@@ -611,8 +636,7 @@ int blk_rq_unmap_user(struct bio *bio)
next_bio = bio; next_bio = bio;
bio = bio->bi_next; bio = bio->bi_next;
bio_uninit(next_bio); bio_map_put(next_bio);
kfree(next_bio);
} }
return ret; return ret;


@@ -32,7 +32,7 @@ static int get_first_sibling(unsigned int cpu)
return cpu; return cpu;
} }
int blk_mq_map_queues(struct blk_mq_queue_map *qmap) void blk_mq_map_queues(struct blk_mq_queue_map *qmap)
{ {
unsigned int *map = qmap->mq_map; unsigned int *map = qmap->mq_map;
unsigned int nr_queues = qmap->nr_queues; unsigned int nr_queues = qmap->nr_queues;
@@ -70,8 +70,6 @@ int blk_mq_map_queues(struct blk_mq_queue_map *qmap)
map[cpu] = map[first_sibling]; map[cpu] = map[first_sibling];
} }
} }
return 0;
} }
EXPORT_SYMBOL_GPL(blk_mq_map_queues); EXPORT_SYMBOL_GPL(blk_mq_map_queues);


@@ -807,8 +807,6 @@ static const char *rq_qos_id_to_name(enum rq_qos_id id)
return "latency"; return "latency";
case RQ_QOS_COST: case RQ_QOS_COST:
return "cost"; return "cost";
case RQ_QOS_IOPRIO:
return "ioprio";
} }
return "unknown"; return "unknown";
} }


@@ -23,8 +23,8 @@
* that maps a queue to the CPUs that have irq affinity for the corresponding * that maps a queue to the CPUs that have irq affinity for the corresponding
* vector. * vector.
*/ */
int blk_mq_pci_map_queues(struct blk_mq_queue_map *qmap, struct pci_dev *pdev, void blk_mq_pci_map_queues(struct blk_mq_queue_map *qmap, struct pci_dev *pdev,
int offset) int offset)
{ {
const struct cpumask *mask; const struct cpumask *mask;
unsigned int queue, cpu; unsigned int queue, cpu;
@@ -38,11 +38,10 @@ int blk_mq_pci_map_queues(struct blk_mq_queue_map *qmap, struct pci_dev *pdev,
qmap->mq_map[cpu] = qmap->queue_offset + queue; qmap->mq_map[cpu] = qmap->queue_offset + queue;
} }
return 0; return;
fallback: fallback:
WARN_ON_ONCE(qmap->nr_queues > 1); WARN_ON_ONCE(qmap->nr_queues > 1);
blk_mq_clear_mq_map(qmap); blk_mq_clear_mq_map(qmap);
return 0;
} }
EXPORT_SYMBOL_GPL(blk_mq_pci_map_queues); EXPORT_SYMBOL_GPL(blk_mq_pci_map_queues);
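
With blk_mq_pci_map_queues() (and the other map helpers in this series) returning void, a PCI driver's ->map_queues() callback reduces to a plain call. A sketch, under the assumption that the callback itself has also been converted to return void as the in-tree drivers were; the driver-private structure is hypothetical:

#include <linux/blk-mq.h>
#include <linux/blk-mq-pci.h>
#include <linux/pci.h>

struct example_pci_dev {                /* hypothetical driver-private data */
        struct pci_dev *pdev;
};

static void example_map_queues(struct blk_mq_tag_set *set)
{
        struct example_pci_dev *edev = set->driver_data;

        /* No error to propagate any more; if a vector lacks an affinity mask
         * the helper falls back to blk_mq_clear_mq_map() internally. */
        blk_mq_pci_map_queues(&set->map[HCTX_TYPE_DEFAULT], edev->pdev, 0);
}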


@@ -21,7 +21,7 @@
* @set->nr_hw_queues, or @dev does not provide an affinity mask for a * @set->nr_hw_queues, or @dev does not provide an affinity mask for a
* vector, we fallback to the naive mapping. * vector, we fallback to the naive mapping.
*/ */
int blk_mq_rdma_map_queues(struct blk_mq_queue_map *map, void blk_mq_rdma_map_queues(struct blk_mq_queue_map *map,
struct ib_device *dev, int first_vec) struct ib_device *dev, int first_vec)
{ {
const struct cpumask *mask; const struct cpumask *mask;
@@ -36,9 +36,9 @@ int blk_mq_rdma_map_queues(struct blk_mq_queue_map *map,
map->mq_map[cpu] = map->queue_offset + queue; map->mq_map[cpu] = map->queue_offset + queue;
} }
return 0; return;
fallback: fallback:
return blk_mq_map_queues(map); blk_mq_map_queues(map);
} }
EXPORT_SYMBOL_GPL(blk_mq_rdma_map_queues); EXPORT_SYMBOL_GPL(blk_mq_rdma_map_queues);


@@ -196,7 +196,7 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
* other allocations on previous queue won't be starved. * other allocations on previous queue won't be starved.
*/ */
if (bt != bt_prev) if (bt != bt_prev)
sbitmap_queue_wake_up(bt_prev); sbitmap_queue_wake_up(bt_prev, 1);
ws = bt_wait_ptr(bt, data->hctx); ws = bt_wait_ptr(bt, data->hctx);
} while (1); } while (1);


@@ -21,7 +21,7 @@
* that maps a queue to the CPUs that have irq affinity for the corresponding * that maps a queue to the CPUs that have irq affinity for the corresponding
* vector. * vector.
*/ */
int blk_mq_virtio_map_queues(struct blk_mq_queue_map *qmap, void blk_mq_virtio_map_queues(struct blk_mq_queue_map *qmap,
struct virtio_device *vdev, int first_vec) struct virtio_device *vdev, int first_vec)
{ {
const struct cpumask *mask; const struct cpumask *mask;
@@ -39,8 +39,9 @@ int blk_mq_virtio_map_queues(struct blk_mq_queue_map *qmap,
qmap->mq_map[cpu] = qmap->queue_offset + queue; qmap->mq_map[cpu] = qmap->queue_offset + queue;
} }
return 0; return;
fallback: fallback:
return blk_mq_map_queues(qmap); blk_mq_map_queues(qmap);
} }
EXPORT_SYMBOL_GPL(blk_mq_virtio_map_queues); EXPORT_SYMBOL_GPL(blk_mq_virtio_map_queues);
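The four hunks above convert blk_mq_map_queues() and the PCI/RDMA/virtio mapping helpers from int to void, since the fallback path can no longer fail. As a minimal sketch (a hypothetical driver, not part of this series), a driver's ->map_queues callback simplifies accordingly:

/*
 * Sketch only: mydrv_map_queues() is a made-up callback. Before this
 * series it returned an int; with the void helpers there is no error
 * left to propagate.
 */
static void mydrv_map_queues(struct blk_mq_tag_set *set)
{
	/* single map, default hctx type; the helper now returns void */
	blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]);
}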


@@ -1093,10 +1093,12 @@ bool blk_mq_complete_request_remote(struct request *rq)
WRITE_ONCE(rq->state, MQ_RQ_COMPLETE); WRITE_ONCE(rq->state, MQ_RQ_COMPLETE);
/* /*
* For a polled request, always complete locally, it's pointless * For request which hctx has only one ctx mapping,
* to redirect the completion. * or a polled request, always complete locally,
* it's pointless to redirect the completion.
*/ */
if (rq->cmd_flags & REQ_POLLED) if (rq->mq_hctx->nr_ctx == 1 ||
rq->cmd_flags & REQ_POLLED)
return false; return false;
if (blk_mq_complete_need_ipi(rq)) { if (blk_mq_complete_need_ipi(rq)) {
@@ -1213,6 +1215,12 @@ void blk_execute_rq_nowait(struct request *rq, bool at_head)
WARN_ON(!blk_rq_is_passthrough(rq)); WARN_ON(!blk_rq_is_passthrough(rq));
blk_account_io_start(rq); blk_account_io_start(rq);
/*
* As plugging can be enabled for passthrough requests on a zoned
* device, directly accessing the plug instead of using blk_mq_plug()
* should not have any consequences.
*/
if (current->plug) if (current->plug)
blk_add_rq_to_plug(current->plug, rq); blk_add_rq_to_plug(current->plug, rq);
else else
@@ -1993,7 +2001,7 @@ out:
if (!needs_restart || if (!needs_restart ||
(no_tag && list_empty_careful(&hctx->dispatch_wait.entry))) (no_tag && list_empty_careful(&hctx->dispatch_wait.entry)))
blk_mq_run_hw_queue(hctx, true); blk_mq_run_hw_queue(hctx, true);
else if (needs_restart && needs_resource) else if (needs_resource)
blk_mq_delay_run_hw_queue(hctx, BLK_MQ_RESOURCE_DELAY); blk_mq_delay_run_hw_queue(hctx, BLK_MQ_RESOURCE_DELAY);
blk_mq_update_dispatch_busy(hctx, true); blk_mq_update_dispatch_busy(hctx, true);
@@ -4192,7 +4200,7 @@ static int blk_mq_alloc_set_map_and_rqs(struct blk_mq_tag_set *set)
return 0; return 0;
} }
static int blk_mq_update_queue_map(struct blk_mq_tag_set *set) static void blk_mq_update_queue_map(struct blk_mq_tag_set *set)
{ {
/* /*
* blk_mq_map_queues() and multiple .map_queues() implementations * blk_mq_map_queues() and multiple .map_queues() implementations
@@ -4222,10 +4230,10 @@ static int blk_mq_update_queue_map(struct blk_mq_tag_set *set)
for (i = 0; i < set->nr_maps; i++) for (i = 0; i < set->nr_maps; i++)
blk_mq_clear_mq_map(&set->map[i]); blk_mq_clear_mq_map(&set->map[i]);
return set->ops->map_queues(set); set->ops->map_queues(set);
} else { } else {
BUG_ON(set->nr_maps > 1); BUG_ON(set->nr_maps > 1);
return blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]); blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]);
} }
} }
@@ -4324,9 +4332,7 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
set->map[i].nr_queues = is_kdump_kernel() ? 1 : set->nr_hw_queues; set->map[i].nr_queues = is_kdump_kernel() ? 1 : set->nr_hw_queues;
} }
ret = blk_mq_update_queue_map(set); blk_mq_update_queue_map(set);
if (ret)
goto out_free_mq_map;
ret = blk_mq_alloc_set_map_and_rqs(set); ret = blk_mq_alloc_set_map_and_rqs(set);
if (ret) if (ret)
@@ -4474,14 +4480,14 @@ static bool blk_mq_elv_switch_none(struct list_head *head,
list_add(&qe->node, head); list_add(&qe->node, head);
/* /*
* After elevator_switch_mq, the previous elevator_queue will be * After elevator_switch, the previous elevator_queue will be
* released by elevator_release. The reference of the io scheduler * released by elevator_release. The reference of the io scheduler
* module get by elevator_get will also be put. So we need to get * module get by elevator_get will also be put. So we need to get
* a reference of the io scheduler module here to prevent it to be * a reference of the io scheduler module here to prevent it to be
* removed. * removed.
*/ */
__module_get(qe->type->elevator_owner); __module_get(qe->type->elevator_owner);
elevator_switch_mq(q, NULL); elevator_switch(q, NULL);
mutex_unlock(&q->sysfs_lock); mutex_unlock(&q->sysfs_lock);
return true; return true;
@@ -4513,7 +4519,7 @@ static void blk_mq_elv_switch_back(struct list_head *head,
kfree(qe); kfree(qe);
mutex_lock(&q->sysfs_lock); mutex_lock(&q->sysfs_lock);
elevator_switch_mq(q, t); elevator_switch(q, t);
mutex_unlock(&q->sysfs_lock); mutex_unlock(&q->sysfs_lock);
} }


@@ -312,7 +312,8 @@ static inline void blk_mq_clear_mq_map(struct blk_mq_queue_map *qmap)
static inline struct blk_plug *blk_mq_plug( struct bio *bio) static inline struct blk_plug *blk_mq_plug( struct bio *bio)
{ {
/* Zoned block device write operation case: do not plug the BIO */ /* Zoned block device write operation case: do not plug the BIO */
if (bdev_is_zoned(bio->bi_bdev) && op_is_write(bio_op(bio))) if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
bdev_op_is_zoned_write(bio->bi_bdev, bio_op(bio)))
return NULL; return NULL;
/* /*


@@ -17,7 +17,6 @@ enum rq_qos_id {
RQ_QOS_WBT, RQ_QOS_WBT,
RQ_QOS_LATENCY, RQ_QOS_LATENCY,
RQ_QOS_COST, RQ_QOS_COST,
RQ_QOS_IOPRIO,
}; };
struct rq_wait { struct rq_wait {


@@ -844,7 +844,7 @@ int blk_register_queue(struct gendisk *disk)
blk_queue_flag_set(QUEUE_FLAG_REGISTERED, q); blk_queue_flag_set(QUEUE_FLAG_REGISTERED, q);
wbt_enable_default(q); wbt_enable_default(q);
blk_throtl_register_queue(q); blk_throtl_register(disk);
/* Now everything is ready and send out KOBJ_ADD uevent */ /* Now everything is ready and send out KOBJ_ADD uevent */
kobject_uevent(&q->kobj, KOBJ_ADD); kobject_uevent(&q->kobj, KOBJ_ADD);


@@ -329,8 +329,8 @@ static struct bio *throtl_pop_queued(struct list_head *queued,
/* init a service_queue, assumes the caller zeroed it */ /* init a service_queue, assumes the caller zeroed it */
static void throtl_service_queue_init(struct throtl_service_queue *sq) static void throtl_service_queue_init(struct throtl_service_queue *sq)
{ {
INIT_LIST_HEAD(&sq->queued[0]); INIT_LIST_HEAD(&sq->queued[READ]);
INIT_LIST_HEAD(&sq->queued[1]); INIT_LIST_HEAD(&sq->queued[WRITE]);
sq->pending_tree = RB_ROOT_CACHED; sq->pending_tree = RB_ROOT_CACHED;
timer_setup(&sq->pending_timer, throtl_pending_timer_fn, 0); timer_setup(&sq->pending_timer, throtl_pending_timer_fn, 0);
} }
@@ -420,24 +420,17 @@ static void tg_update_has_rules(struct throtl_grp *tg)
struct throtl_grp *parent_tg = sq_to_tg(tg->service_queue.parent_sq); struct throtl_grp *parent_tg = sq_to_tg(tg->service_queue.parent_sq);
struct throtl_data *td = tg->td; struct throtl_data *td = tg->td;
int rw; int rw;
int has_iops_limit = 0;
for (rw = READ; rw <= WRITE; rw++) { for (rw = READ; rw <= WRITE; rw++) {
unsigned int iops_limit = tg_iops_limit(tg, rw); tg->has_rules_iops[rw] =
(parent_tg && parent_tg->has_rules_iops[rw]) ||
tg->has_rules[rw] = (parent_tg && parent_tg->has_rules[rw]) ||
(td->limit_valid[td->limit_index] && (td->limit_valid[td->limit_index] &&
(tg_bps_limit(tg, rw) != U64_MAX || tg_iops_limit(tg, rw) != UINT_MAX);
iops_limit != UINT_MAX)); tg->has_rules_bps[rw] =
(parent_tg && parent_tg->has_rules_bps[rw]) ||
if (iops_limit != UINT_MAX) (td->limit_valid[td->limit_index] &&
has_iops_limit = 1; (tg_bps_limit(tg, rw) != U64_MAX));
} }
if (has_iops_limit)
tg->flags |= THROTL_TG_HAS_IOPS_LIMIT;
else
tg->flags &= ~THROTL_TG_HAS_IOPS_LIMIT;
} }
static void throtl_pd_online(struct blkg_policy_data *pd) static void throtl_pd_online(struct blkg_policy_data *pd)
@@ -520,7 +513,6 @@ static void throtl_rb_erase(struct rb_node *n,
{ {
rb_erase_cached(n, &parent_sq->pending_tree); rb_erase_cached(n, &parent_sq->pending_tree);
RB_CLEAR_NODE(n); RB_CLEAR_NODE(n);
--parent_sq->nr_pending;
} }
static void update_min_dispatch_time(struct throtl_service_queue *parent_sq) static void update_min_dispatch_time(struct throtl_service_queue *parent_sq)
@@ -572,7 +564,11 @@ static void throtl_enqueue_tg(struct throtl_grp *tg)
static void throtl_dequeue_tg(struct throtl_grp *tg) static void throtl_dequeue_tg(struct throtl_grp *tg)
{ {
if (tg->flags & THROTL_TG_PENDING) { if (tg->flags & THROTL_TG_PENDING) {
throtl_rb_erase(&tg->rb_node, tg->service_queue.parent_sq); struct throtl_service_queue *parent_sq =
tg->service_queue.parent_sq;
throtl_rb_erase(&tg->rb_node, parent_sq);
--parent_sq->nr_pending;
tg->flags &= ~THROTL_TG_PENDING; tg->flags &= ~THROTL_TG_PENDING;
} }
} }
@@ -639,6 +635,8 @@ static inline void throtl_start_new_slice_with_credit(struct throtl_grp *tg,
{ {
tg->bytes_disp[rw] = 0; tg->bytes_disp[rw] = 0;
tg->io_disp[rw] = 0; tg->io_disp[rw] = 0;
tg->carryover_bytes[rw] = 0;
tg->carryover_ios[rw] = 0;
/* /*
* Previous slice has expired. We must have trimmed it after last * Previous slice has expired. We must have trimmed it after last
@@ -656,12 +654,17 @@ static inline void throtl_start_new_slice_with_credit(struct throtl_grp *tg,
tg->slice_end[rw], jiffies); tg->slice_end[rw], jiffies);
} }
static inline void throtl_start_new_slice(struct throtl_grp *tg, bool rw) static inline void throtl_start_new_slice(struct throtl_grp *tg, bool rw,
bool clear_carryover)
{ {
tg->bytes_disp[rw] = 0; tg->bytes_disp[rw] = 0;
tg->io_disp[rw] = 0; tg->io_disp[rw] = 0;
tg->slice_start[rw] = jiffies; tg->slice_start[rw] = jiffies;
tg->slice_end[rw] = jiffies + tg->td->throtl_slice; tg->slice_end[rw] = jiffies + tg->td->throtl_slice;
if (clear_carryover) {
tg->carryover_bytes[rw] = 0;
tg->carryover_ios[rw] = 0;
}
throtl_log(&tg->service_queue, throtl_log(&tg->service_queue,
"[%c] new slice start=%lu end=%lu jiffies=%lu", "[%c] new slice start=%lu end=%lu jiffies=%lu",
@@ -754,13 +757,76 @@ static inline void throtl_trim_slice(struct throtl_grp *tg, bool rw)
tg->slice_start[rw], tg->slice_end[rw], jiffies); tg->slice_start[rw], tg->slice_end[rw], jiffies);
} }
static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio, static unsigned int calculate_io_allowed(u32 iops_limit,
u32 iops_limit, unsigned long *wait) unsigned long jiffy_elapsed)
{
unsigned int io_allowed;
u64 tmp;
/*
* jiffy_elapsed should not be a big value as minimum iops can be
* 1 then at max jiffy elapsed should be equivalent of 1 second as we
* will allow dispatch after 1 second and after that slice should
* have been trimmed.
*/
tmp = (u64)iops_limit * jiffy_elapsed;
do_div(tmp, HZ);
if (tmp > UINT_MAX)
io_allowed = UINT_MAX;
else
io_allowed = tmp;
return io_allowed;
}
static u64 calculate_bytes_allowed(u64 bps_limit, unsigned long jiffy_elapsed)
{
return mul_u64_u64_div_u64(bps_limit, (u64)jiffy_elapsed, (u64)HZ);
}
static void __tg_update_carryover(struct throtl_grp *tg, bool rw)
{
unsigned long jiffy_elapsed = jiffies - tg->slice_start[rw];
u64 bps_limit = tg_bps_limit(tg, rw);
u32 iops_limit = tg_iops_limit(tg, rw);
/*
* If config is updated while bios are still throttled, calculate and
* accumulate how many bytes/ios are waited across changes. And
* carryover_bytes/ios will be used to calculate new wait time under new
* configuration.
*/
if (bps_limit != U64_MAX)
tg->carryover_bytes[rw] +=
calculate_bytes_allowed(bps_limit, jiffy_elapsed) -
tg->bytes_disp[rw];
if (iops_limit != UINT_MAX)
tg->carryover_ios[rw] +=
calculate_io_allowed(iops_limit, jiffy_elapsed) -
tg->io_disp[rw];
}
static void tg_update_carryover(struct throtl_grp *tg)
{
if (tg->service_queue.nr_queued[READ])
__tg_update_carryover(tg, READ);
if (tg->service_queue.nr_queued[WRITE])
__tg_update_carryover(tg, WRITE);
/* see comments in struct throtl_grp for meaning of these fields. */
throtl_log(&tg->service_queue, "%s: %llu %llu %u %u\n", __func__,
tg->carryover_bytes[READ], tg->carryover_bytes[WRITE],
tg->carryover_ios[READ], tg->carryover_ios[WRITE]);
}
static bool tg_within_iops_limit(struct throtl_grp *tg, struct bio *bio,
u32 iops_limit, unsigned long *wait)
{ {
bool rw = bio_data_dir(bio); bool rw = bio_data_dir(bio);
unsigned int io_allowed; unsigned int io_allowed;
unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd; unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;
u64 tmp;
if (iops_limit == UINT_MAX) { if (iops_limit == UINT_MAX) {
if (wait) if (wait)
@@ -772,22 +838,8 @@ static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio,
/* Round up to the next throttle slice, wait time must be nonzero */ /* Round up to the next throttle slice, wait time must be nonzero */
jiffy_elapsed_rnd = roundup(jiffy_elapsed + 1, tg->td->throtl_slice); jiffy_elapsed_rnd = roundup(jiffy_elapsed + 1, tg->td->throtl_slice);
io_allowed = calculate_io_allowed(iops_limit, jiffy_elapsed_rnd) +
/* tg->carryover_ios[rw];
* jiffy_elapsed_rnd should not be a big value as minimum iops can be
* 1 then at max jiffy elapsed should be equivalent of 1 second as we
* will allow dispatch after 1 second and after that slice should
* have been trimmed.
*/
tmp = (u64)iops_limit * jiffy_elapsed_rnd;
do_div(tmp, HZ);
if (tmp > UINT_MAX)
io_allowed = UINT_MAX;
else
io_allowed = tmp;
if (tg->io_disp[rw] + 1 <= io_allowed) { if (tg->io_disp[rw] + 1 <= io_allowed) {
if (wait) if (wait)
*wait = 0; *wait = 0;
@@ -802,16 +854,16 @@ static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio,
return false; return false;
} }
static bool tg_with_in_bps_limit(struct throtl_grp *tg, struct bio *bio, static bool tg_within_bps_limit(struct throtl_grp *tg, struct bio *bio,
u64 bps_limit, unsigned long *wait) u64 bps_limit, unsigned long *wait)
{ {
bool rw = bio_data_dir(bio); bool rw = bio_data_dir(bio);
u64 bytes_allowed, extra_bytes, tmp; u64 bytes_allowed, extra_bytes;
unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd; unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;
unsigned int bio_size = throtl_bio_data_size(bio); unsigned int bio_size = throtl_bio_data_size(bio);
/* no need to throttle if this bio's bytes have been accounted */ /* no need to throttle if this bio's bytes have been accounted */
if (bps_limit == U64_MAX || bio_flagged(bio, BIO_THROTTLED)) { if (bps_limit == U64_MAX || bio_flagged(bio, BIO_BPS_THROTTLED)) {
if (wait) if (wait)
*wait = 0; *wait = 0;
return true; return true;
@@ -824,11 +876,8 @@ static bool tg_with_in_bps_limit(struct throtl_grp *tg, struct bio *bio,
jiffy_elapsed_rnd = tg->td->throtl_slice; jiffy_elapsed_rnd = tg->td->throtl_slice;
jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, tg->td->throtl_slice); jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, tg->td->throtl_slice);
bytes_allowed = calculate_bytes_allowed(bps_limit, jiffy_elapsed_rnd) +
tmp = bps_limit * jiffy_elapsed_rnd; tg->carryover_bytes[rw];
do_div(tmp, HZ);
bytes_allowed = tmp;
if (tg->bytes_disp[rw] + bio_size <= bytes_allowed) { if (tg->bytes_disp[rw] + bio_size <= bytes_allowed) {
if (wait) if (wait)
*wait = 0; *wait = 0;
@@ -889,7 +938,7 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio,
* slice and it should be extended instead. * slice and it should be extended instead.
*/ */
if (throtl_slice_used(tg, rw) && !(tg->service_queue.nr_queued[rw])) if (throtl_slice_used(tg, rw) && !(tg->service_queue.nr_queued[rw]))
throtl_start_new_slice(tg, rw); throtl_start_new_slice(tg, rw, true);
else { else {
if (time_before(tg->slice_end[rw], if (time_before(tg->slice_end[rw],
jiffies + tg->td->throtl_slice)) jiffies + tg->td->throtl_slice))
@@ -897,8 +946,8 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio,
jiffies + tg->td->throtl_slice); jiffies + tg->td->throtl_slice);
} }
if (tg_with_in_bps_limit(tg, bio, bps_limit, &bps_wait) && if (tg_within_bps_limit(tg, bio, bps_limit, &bps_wait) &&
tg_with_in_iops_limit(tg, bio, iops_limit, &iops_wait)) { tg_within_iops_limit(tg, bio, iops_limit, &iops_wait)) {
if (wait) if (wait)
*wait = 0; *wait = 0;
return true; return true;
@@ -921,22 +970,13 @@ static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)
unsigned int bio_size = throtl_bio_data_size(bio); unsigned int bio_size = throtl_bio_data_size(bio);
/* Charge the bio to the group */ /* Charge the bio to the group */
if (!bio_flagged(bio, BIO_THROTTLED)) { if (!bio_flagged(bio, BIO_BPS_THROTTLED)) {
tg->bytes_disp[rw] += bio_size; tg->bytes_disp[rw] += bio_size;
tg->last_bytes_disp[rw] += bio_size; tg->last_bytes_disp[rw] += bio_size;
} }
tg->io_disp[rw]++; tg->io_disp[rw]++;
tg->last_io_disp[rw]++; tg->last_io_disp[rw]++;
/*
* BIO_THROTTLED is used to prevent the same bio to be throttled
* more than once as a throttled bio will go through blk-throtl the
* second time when it eventually gets issued. Set it when a bio
* is being charged to a tg.
*/
if (!bio_flagged(bio, BIO_THROTTLED))
bio_set_flag(bio, BIO_THROTTLED);
} }
/** /**
@@ -990,9 +1030,9 @@ static void tg_update_disptime(struct throtl_grp *tg)
disptime = jiffies + min_wait; disptime = jiffies + min_wait;
/* Update dispatch time */ /* Update dispatch time */
throtl_dequeue_tg(tg); throtl_rb_erase(&tg->rb_node, tg->service_queue.parent_sq);
tg->disptime = disptime; tg->disptime = disptime;
throtl_enqueue_tg(tg); tg_service_queue_add(tg);
/* see throtl_add_bio_tg() */ /* see throtl_add_bio_tg() */
tg->flags &= ~THROTL_TG_WAS_EMPTY; tg->flags &= ~THROTL_TG_WAS_EMPTY;
@@ -1026,6 +1066,7 @@ static void tg_dispatch_one_bio(struct throtl_grp *tg, bool rw)
sq->nr_queued[rw]--; sq->nr_queued[rw]--;
throtl_charge_bio(tg, bio); throtl_charge_bio(tg, bio);
bio_set_flag(bio, BIO_BPS_THROTTLED);
/* /*
* If our parent is another tg, we just need to transfer @bio to * If our parent is another tg, we just need to transfer @bio to
@@ -1101,13 +1142,13 @@ static int throtl_select_dispatch(struct throtl_service_queue *parent_sq)
if (time_before(jiffies, tg->disptime)) if (time_before(jiffies, tg->disptime))
break; break;
throtl_dequeue_tg(tg);
nr_disp += throtl_dispatch_tg(tg); nr_disp += throtl_dispatch_tg(tg);
sq = &tg->service_queue; sq = &tg->service_queue;
if (sq->nr_queued[0] || sq->nr_queued[1]) if (sq->nr_queued[READ] || sq->nr_queued[WRITE])
tg_update_disptime(tg); tg_update_disptime(tg);
else
throtl_dequeue_tg(tg);
if (nr_disp >= THROTL_QUANTUM) if (nr_disp >= THROTL_QUANTUM)
break; break;
@@ -1321,8 +1362,8 @@ static void tg_conf_updated(struct throtl_grp *tg, bool global)
* that a group's limit are dropped suddenly and we don't want to * that a group's limit are dropped suddenly and we don't want to
* account recently dispatched IO with new low rate. * account recently dispatched IO with new low rate.
*/ */
throtl_start_new_slice(tg, READ); throtl_start_new_slice(tg, READ, false);
throtl_start_new_slice(tg, WRITE); throtl_start_new_slice(tg, WRITE, false);
if (tg->flags & THROTL_TG_PENDING) { if (tg->flags & THROTL_TG_PENDING) {
tg_update_disptime(tg); tg_update_disptime(tg);
@@ -1350,6 +1391,7 @@ static ssize_t tg_set_conf(struct kernfs_open_file *of,
v = U64_MAX; v = U64_MAX;
tg = blkg_to_tg(ctx.blkg); tg = blkg_to_tg(ctx.blkg);
tg_update_carryover(tg);
if (is_u64) if (is_u64)
*(u64 *)((void *)tg + of_cft(of)->private) = v; *(u64 *)((void *)tg + of_cft(of)->private) = v;
@@ -1536,6 +1578,7 @@ static ssize_t tg_set_limit(struct kernfs_open_file *of,
return ret; return ret;
tg = blkg_to_tg(ctx.blkg); tg = blkg_to_tg(ctx.blkg);
tg_update_carryover(tg);
v[0] = tg->bps_conf[READ][index]; v[0] = tg->bps_conf[READ][index];
v[1] = tg->bps_conf[WRITE][index]; v[1] = tg->bps_conf[WRITE][index];
@@ -1673,6 +1716,41 @@ struct blkcg_policy blkcg_policy_throtl = {
.pd_free_fn = throtl_pd_free, .pd_free_fn = throtl_pd_free,
}; };
void blk_throtl_cancel_bios(struct gendisk *disk)
{
struct request_queue *q = disk->queue;
struct cgroup_subsys_state *pos_css;
struct blkcg_gq *blkg;
spin_lock_irq(&q->queue_lock);
/*
* queue_lock is held, rcu lock is not needed here technically.
* However, rcu lock is still held to emphasize that following
* path need RCU protection and to prevent warning from lockdep.
*/
rcu_read_lock();
blkg_for_each_descendant_post(blkg, pos_css, q->root_blkg) {
struct throtl_grp *tg = blkg_to_tg(blkg);
struct throtl_service_queue *sq = &tg->service_queue;
/*
* Set the flag to make sure throtl_pending_timer_fn() won't
* stop until all throttled bios are dispatched.
*/
blkg_to_tg(blkg)->flags |= THROTL_TG_CANCELING;
/*
* Update disptime after setting the above flag to make sure
* throtl_select_dispatch() won't exit without dispatching.
*/
tg_update_disptime(tg);
throtl_schedule_pending_timer(sq, jiffies + 1);
}
rcu_read_unlock();
spin_unlock_irq(&q->queue_lock);
}
#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
static unsigned long __tg_last_low_overflow_time(struct throtl_grp *tg) static unsigned long __tg_last_low_overflow_time(struct throtl_grp *tg)
{ {
unsigned long rtime = jiffies, wtime = jiffies; unsigned long rtime = jiffies, wtime = jiffies;
@@ -1777,39 +1855,6 @@ static bool throtl_hierarchy_can_upgrade(struct throtl_grp *tg)
return false; return false;
} }
void blk_throtl_cancel_bios(struct request_queue *q)
{
struct cgroup_subsys_state *pos_css;
struct blkcg_gq *blkg;
spin_lock_irq(&q->queue_lock);
/*
* queue_lock is held, rcu lock is not needed here technically.
* However, rcu lock is still held to emphasize that following
* path need RCU protection and to prevent warning from lockdep.
*/
rcu_read_lock();
blkg_for_each_descendant_post(blkg, pos_css, q->root_blkg) {
struct throtl_grp *tg = blkg_to_tg(blkg);
struct throtl_service_queue *sq = &tg->service_queue;
/*
* Set the flag to make sure throtl_pending_timer_fn() won't
* stop until all throttled bios are dispatched.
*/
blkg_to_tg(blkg)->flags |= THROTL_TG_CANCELING;
/*
* Update disptime after setting the above flag to make sure
* throtl_select_dispatch() won't exit without dispatching.
*/
tg_update_disptime(tg);
throtl_schedule_pending_timer(sq, jiffies + 1);
}
rcu_read_unlock();
spin_unlock_irq(&q->queue_lock);
}
static bool throtl_can_upgrade(struct throtl_data *td, static bool throtl_can_upgrade(struct throtl_data *td,
struct throtl_grp *this_tg) struct throtl_grp *this_tg)
{ {
@@ -2005,7 +2050,6 @@ static void blk_throtl_update_idletime(struct throtl_grp *tg)
tg->checked_last_finish_time = last_finish_time; tg->checked_last_finish_time = last_finish_time;
} }
#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
static void throtl_update_latency_buckets(struct throtl_data *td) static void throtl_update_latency_buckets(struct throtl_data *td)
{ {
struct avg_latency_bucket avg_latency[2][LATENCY_BUCKET_SIZE]; struct avg_latency_bucket avg_latency[2][LATENCY_BUCKET_SIZE];
@@ -2086,6 +2130,28 @@ static void throtl_update_latency_buckets(struct throtl_data *td)
static inline void throtl_update_latency_buckets(struct throtl_data *td) static inline void throtl_update_latency_buckets(struct throtl_data *td)
{ {
} }
static void blk_throtl_update_idletime(struct throtl_grp *tg)
{
}
static void throtl_downgrade_check(struct throtl_grp *tg)
{
}
static void throtl_upgrade_check(struct throtl_grp *tg)
{
}
static bool throtl_can_upgrade(struct throtl_data *td,
struct throtl_grp *this_tg)
{
return false;
}
static void throtl_upgrade_state(struct throtl_data *td)
{
}
#endif #endif
bool __blk_throtl_bio(struct bio *bio) bool __blk_throtl_bio(struct bio *bio)
@@ -2159,8 +2225,10 @@ again:
qn = &tg->qnode_on_parent[rw]; qn = &tg->qnode_on_parent[rw];
sq = sq->parent_sq; sq = sq->parent_sq;
tg = sq_to_tg(sq); tg = sq_to_tg(sq);
if (!tg) if (!tg) {
bio_set_flag(bio, BIO_BPS_THROTTLED);
goto out_unlock; goto out_unlock;
}
} }
/* out-of-limit, queue to @tg */ /* out-of-limit, queue to @tg */
@@ -2189,8 +2257,6 @@ again:
} }
out_unlock: out_unlock:
bio_set_flag(bio, BIO_THROTTLED);
#ifdef CONFIG_BLK_DEV_THROTTLING_LOW #ifdef CONFIG_BLK_DEV_THROTTLING_LOW
if (throttled || !td->track_bio_latency) if (throttled || !td->track_bio_latency)
bio->bi_issue.value |= BIO_ISSUE_THROTL_SKIP_LATENCY; bio->bi_issue.value |= BIO_ISSUE_THROTL_SKIP_LATENCY;
@@ -2286,8 +2352,9 @@ void blk_throtl_bio_endio(struct bio *bio)
} }
#endif #endif
int blk_throtl_init(struct request_queue *q) int blk_throtl_init(struct gendisk *disk)
{ {
struct request_queue *q = disk->queue;
struct throtl_data *td; struct throtl_data *td;
int ret; int ret;
@@ -2329,8 +2396,10 @@ int blk_throtl_init(struct request_queue *q)
return ret; return ret;
} }
void blk_throtl_exit(struct request_queue *q) void blk_throtl_exit(struct gendisk *disk)
{ {
struct request_queue *q = disk->queue;
BUG_ON(!q->td); BUG_ON(!q->td);
del_timer_sync(&q->td->service_queue.pending_timer); del_timer_sync(&q->td->service_queue.pending_timer);
throtl_shutdown_wq(q); throtl_shutdown_wq(q);
@@ -2340,8 +2409,9 @@ void blk_throtl_exit(struct request_queue *q)
kfree(q->td); kfree(q->td);
} }
void blk_throtl_register_queue(struct request_queue *q) void blk_throtl_register(struct gendisk *disk)
{ {
struct request_queue *q = disk->queue;
struct throtl_data *td; struct throtl_data *td;
int i; int i;
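To make the new throttling arithmetic in this file concrete, here is a small userspace sketch that mirrors calculate_io_allowed()/calculate_bytes_allowed() and the carryover bookkeeping from __tg_update_carryover() above. It assumes HZ=1000 and uses plain 64-bit division in place of do_div()/mul_u64_u64_div_u64(); the numbers are illustrative only.

#include <stdint.h>
#include <stdio.h>

#define HZ 1000U	/* assumption: CONFIG_HZ=1000 */

static unsigned int calculate_io_allowed(uint32_t iops_limit,
					 unsigned long jiffy_elapsed)
{
	uint64_t tmp = (uint64_t)iops_limit * jiffy_elapsed / HZ;

	/* mirrors the kernel's clamp to UINT_MAX */
	return tmp > UINT32_MAX ? UINT32_MAX : (unsigned int)tmp;
}

static uint64_t calculate_bytes_allowed(uint64_t bps_limit,
					unsigned long jiffy_elapsed)
{
	return bps_limit * jiffy_elapsed / HZ;
}

int main(void)
{
	/* 100 IOPS and 1 MiB/s limits, config changed 500ms into the slice */
	uint32_t iops_limit = 100;
	uint64_t bps_limit = 1024 * 1024;
	unsigned long jiffy_elapsed = HZ / 2;
	unsigned int io_disp = 20;		/* ios already dispatched */
	uint64_t bytes_disp = 128 * 1024;	/* bytes already dispatched */

	/* what __tg_update_carryover() would accumulate (signed here for clarity) */
	long carryover_ios =
		(long)calculate_io_allowed(iops_limit, jiffy_elapsed) - io_disp;
	int64_t carryover_bytes =
		(int64_t)calculate_bytes_allowed(bps_limit, jiffy_elapsed) - bytes_disp;

	printf("io_allowed=%u bytes_allowed=%llu\n",
	       calculate_io_allowed(iops_limit, jiffy_elapsed),
	       (unsigned long long)calculate_bytes_allowed(bps_limit, jiffy_elapsed));
	printf("carryover_ios=%ld carryover_bytes=%lld\n",
	       carryover_ios, (long long)carryover_bytes);
	return 0;
}

With these inputs, 50 ios and 512 KiB were allowed over the elapsed half second, so 30 ios and 384 KiB of unused budget are carried over into the wait-time calculation under the new limits.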


@@ -55,8 +55,7 @@ struct throtl_service_queue {
enum tg_state_flags { enum tg_state_flags {
THROTL_TG_PENDING = 1 << 0, /* on parent's pending tree */ THROTL_TG_PENDING = 1 << 0, /* on parent's pending tree */
THROTL_TG_WAS_EMPTY = 1 << 1, /* bio_lists[] became non-empty */ THROTL_TG_WAS_EMPTY = 1 << 1, /* bio_lists[] became non-empty */
THROTL_TG_HAS_IOPS_LIMIT = 1 << 2, /* tg has iops limit */ THROTL_TG_CANCELING = 1 << 2, /* starts to cancel bio */
THROTL_TG_CANCELING = 1 << 3, /* starts to cancel bio */
}; };
enum { enum {
@@ -99,7 +98,8 @@ struct throtl_grp {
unsigned int flags; unsigned int flags;
/* are there any throtl rules between this group and td? */ /* are there any throtl rules between this group and td? */
bool has_rules[2]; bool has_rules_bps[2];
bool has_rules_iops[2];
/* internally used bytes per second rate limits */ /* internally used bytes per second rate limits */
uint64_t bps[2][LIMIT_CNT]; uint64_t bps[2][LIMIT_CNT];
@@ -121,6 +121,15 @@ struct throtl_grp {
uint64_t last_bytes_disp[2]; uint64_t last_bytes_disp[2];
unsigned int last_io_disp[2]; unsigned int last_io_disp[2];
/*
* The following two fields are updated when new configuration is
* submitted while some bios are still throttled, they record how many
* bytes/ios are waited already in previous configuration, and they will
* be used to calculate wait time under new configuration.
*/
uint64_t carryover_bytes[2];
unsigned int carryover_ios[2];
unsigned long last_check_time; unsigned long last_check_time;
unsigned long latency_target; /* us */ unsigned long latency_target; /* us */
@@ -159,27 +168,37 @@ static inline struct throtl_grp *blkg_to_tg(struct blkcg_gq *blkg)
* Internal throttling interface * Internal throttling interface
*/ */
#ifndef CONFIG_BLK_DEV_THROTTLING #ifndef CONFIG_BLK_DEV_THROTTLING
static inline int blk_throtl_init(struct request_queue *q) { return 0; } static inline int blk_throtl_init(struct gendisk *disk) { return 0; }
static inline void blk_throtl_exit(struct request_queue *q) { } static inline void blk_throtl_exit(struct gendisk *disk) { }
static inline void blk_throtl_register_queue(struct request_queue *q) { } static inline void blk_throtl_register(struct gendisk *disk) { }
static inline bool blk_throtl_bio(struct bio *bio) { return false; } static inline bool blk_throtl_bio(struct bio *bio) { return false; }
static inline void blk_throtl_cancel_bios(struct request_queue *q) { } static inline void blk_throtl_cancel_bios(struct gendisk *disk) { }
#else /* CONFIG_BLK_DEV_THROTTLING */ #else /* CONFIG_BLK_DEV_THROTTLING */
int blk_throtl_init(struct request_queue *q); int blk_throtl_init(struct gendisk *disk);
void blk_throtl_exit(struct request_queue *q); void blk_throtl_exit(struct gendisk *disk);
void blk_throtl_register_queue(struct request_queue *q); void blk_throtl_register(struct gendisk *disk);
bool __blk_throtl_bio(struct bio *bio); bool __blk_throtl_bio(struct bio *bio);
void blk_throtl_cancel_bios(struct request_queue *q); void blk_throtl_cancel_bios(struct gendisk *disk);
static inline bool blk_throtl_bio(struct bio *bio)
static inline bool blk_should_throtl(struct bio *bio)
{ {
struct throtl_grp *tg = blkg_to_tg(bio->bi_blkg); struct throtl_grp *tg = blkg_to_tg(bio->bi_blkg);
int rw = bio_data_dir(bio);
/* no need to throttle bps any more if the bio has been throttled */ /* iops limit is always counted */
if (bio_flagged(bio, BIO_THROTTLED) && if (tg->has_rules_iops[rw])
!(tg->flags & THROTL_TG_HAS_IOPS_LIMIT)) return true;
return false;
if (!tg->has_rules[bio_data_dir(bio)]) if (tg->has_rules_bps[rw] && !bio_flagged(bio, BIO_BPS_THROTTLED))
return true;
return false;
}
static inline bool blk_throtl_bio(struct bio *bio)
{
if (!blk_should_throtl(bio))
return false; return false;
return __blk_throtl_bio(bio); return __blk_throtl_bio(bio);


@@ -843,6 +843,10 @@ int wbt_init(struct request_queue *q)
rwb->enable_state = WBT_STATE_ON_DEFAULT; rwb->enable_state = WBT_STATE_ON_DEFAULT;
rwb->wc = 1; rwb->wc = 1;
rwb->rq_depth.default_depth = RWB_DEF_DEPTH; rwb->rq_depth.default_depth = RWB_DEF_DEPTH;
rwb->min_lat_nsec = wbt_default_latency_nsec(q);
wbt_queue_depth_changed(&rwb->rqos);
wbt_set_write_cache(q, test_bit(QUEUE_FLAG_WC, &q->queue_flags));
/* /*
* Assign rwb and add the stats callback. * Assign rwb and add the stats callback.
@@ -853,11 +857,6 @@ int wbt_init(struct request_queue *q)
blk_stat_add_callback(q, rwb->cb); blk_stat_add_callback(q, rwb->cb);
rwb->min_lat_nsec = wbt_default_latency_nsec(q);
wbt_queue_depth_changed(&rwb->rqos);
wbt_set_write_cache(q, test_bit(QUEUE_FLAG_WC, &q->queue_flags));
return 0; return 0;
err_free: err_free:


@@ -63,13 +63,10 @@ bool blk_req_needs_zone_write_lock(struct request *rq)
if (!rq->q->disk->seq_zones_wlock) if (!rq->q->disk->seq_zones_wlock)
return false; return false;
switch (req_op(rq)) { if (bdev_op_is_zoned_write(rq->q->disk->part0, req_op(rq)))
case REQ_OP_WRITE_ZEROES:
case REQ_OP_WRITE:
return blk_rq_zone_is_seq(rq); return blk_rq_zone_is_seq(rq);
default:
return false; return false;
}
} }
EXPORT_SYMBOL_GPL(blk_req_needs_zone_write_lock); EXPORT_SYMBOL_GPL(blk_req_needs_zone_write_lock);
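Both this hunk and the blk_mq_plug() change earlier rely on the new bdev_op_is_zoned_write() helper. Its definition is introduced elsewhere in this series in include/linux/blkdev.h; the sketch below only shows the check it performs and is not taken verbatim from this diff:

/* sketch: true for REQ_OP_WRITE/REQ_OP_WRITE_ZEROES on a zoned bdev */
static inline bool bdev_op_is_zoned_write(struct block_device *bdev,
					  enum req_op op)
{
	if (!bdev_is_zoned(bdev))
		return false;

	return op == REQ_OP_WRITE || op == REQ_OP_WRITE_ZEROES;
}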


@@ -270,8 +270,7 @@ bool blk_bio_list_merge(struct request_queue *q, struct list_head *list,
void blk_insert_flush(struct request *rq); void blk_insert_flush(struct request *rq);
int elevator_switch_mq(struct request_queue *q, int elevator_switch(struct request_queue *q, struct elevator_type *new_e);
struct elevator_type *new_e);
void elevator_exit(struct request_queue *q); void elevator_exit(struct request_queue *q);
int elv_register_queue(struct request_queue *q, bool uevent); int elv_register_queue(struct request_queue *q, bool uevent);
void elv_unregister_queue(struct request_queue *q); void elv_unregister_queue(struct request_queue *q);
@@ -389,9 +388,9 @@ static inline struct bio *blk_queue_bounce(struct bio *bio,
} }
#ifdef CONFIG_BLK_CGROUP_IOLATENCY #ifdef CONFIG_BLK_CGROUP_IOLATENCY
extern int blk_iolatency_init(struct request_queue *q); int blk_iolatency_init(struct gendisk *disk);
#else #else
static inline int blk_iolatency_init(struct request_queue *q) { return 0; } static inline int blk_iolatency_init(struct gendisk *disk) { return 0; };
#endif #endif
#ifdef CONFIG_BLK_DEV_ZONED #ifdef CONFIG_BLK_DEV_ZONED


@@ -588,7 +588,7 @@ void elv_unregister(struct elevator_type *e)
} }
EXPORT_SYMBOL_GPL(elv_unregister); EXPORT_SYMBOL_GPL(elv_unregister);
int elevator_switch_mq(struct request_queue *q, static int elevator_switch_mq(struct request_queue *q,
struct elevator_type *new_e) struct elevator_type *new_e)
{ {
int ret; int ret;
@@ -723,7 +723,7 @@ void elevator_init_mq(struct request_queue *q)
* need for the new one. this way we have a chance of going back to the old * need for the new one. this way we have a chance of going back to the old
* one, if the new one fails init for some reason. * one, if the new one fails init for some reason.
*/ */
static int elevator_switch(struct request_queue *q, struct elevator_type *new_e) int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
{ {
int err; int err;


@@ -627,7 +627,7 @@ void del_gendisk(struct gendisk *disk)
blk_mq_freeze_queue_wait(q); blk_mq_freeze_queue_wait(q);
blk_throtl_cancel_bios(disk->queue); blk_throtl_cancel_bios(disk);
blk_sync_queue(q); blk_sync_queue(q);
blk_flush_integrity(); blk_flush_integrity();
@@ -1151,7 +1151,8 @@ static void disk_release(struct device *dev)
!test_bit(GD_ADDED, &disk->state)) !test_bit(GD_ADDED, &disk->state))
blk_mq_exit_queue(disk->queue); blk_mq_exit_queue(disk->queue);
blkcg_exit_queue(disk->queue); blkcg_exit_disk(disk);
bioset_exit(&disk->bio_split); bioset_exit(&disk->bio_split);
disk_release_events(disk); disk_release_events(disk);
@@ -1364,7 +1365,7 @@ struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id,
if (xa_insert(&disk->part_tbl, 0, disk->part0, GFP_KERNEL)) if (xa_insert(&disk->part_tbl, 0, disk->part0, GFP_KERNEL))
goto out_destroy_part_tbl; goto out_destroy_part_tbl;
if (blkcg_init_queue(q)) if (blkcg_init_disk(disk))
goto out_erase_part0; goto out_erase_part0;
rand_initialize_disk(disk); rand_initialize_disk(disk);


@@ -39,7 +39,12 @@ enum opal_response_token {
#define FIRST_TPER_SESSION_NUM 4096 #define FIRST_TPER_SESSION_NUM 4096
#define TPER_SYNC_SUPPORTED 0x01 #define TPER_SYNC_SUPPORTED 0x01
/* FC_LOCKING features */
#define LOCKING_SUPPORTED_MASK 0x01
#define LOCKING_ENABLED_MASK 0x02
#define LOCKED_MASK 0x04
#define MBR_ENABLED_MASK 0x10 #define MBR_ENABLED_MASK 0x10
#define MBR_DONE_MASK 0x20
#define TINY_ATOM_DATA_MASK 0x3F #define TINY_ATOM_DATA_MASK 0x3F
#define TINY_ATOM_SIGNED 0x40 #define TINY_ATOM_SIGNED 0x40


@@ -74,8 +74,7 @@ struct parsed_resp {
}; };
struct opal_dev { struct opal_dev {
bool supported; u32 flags;
bool mbr_enabled;
void *data; void *data;
sec_send_recv *send_recv; sec_send_recv *send_recv;
@@ -280,6 +279,30 @@ static bool check_tper(const void *data)
return true; return true;
} }
static bool check_lcksuppt(const void *data)
{
const struct d0_locking_features *lfeat = data;
u8 sup_feat = lfeat->supported_features;
return !!(sup_feat & LOCKING_SUPPORTED_MASK);
}
static bool check_lckenabled(const void *data)
{
const struct d0_locking_features *lfeat = data;
u8 sup_feat = lfeat->supported_features;
return !!(sup_feat & LOCKING_ENABLED_MASK);
}
static bool check_locked(const void *data)
{
const struct d0_locking_features *lfeat = data;
u8 sup_feat = lfeat->supported_features;
return !!(sup_feat & LOCKED_MASK);
}
static bool check_mbrenabled(const void *data) static bool check_mbrenabled(const void *data)
{ {
const struct d0_locking_features *lfeat = data; const struct d0_locking_features *lfeat = data;
@@ -288,6 +311,14 @@ static bool check_mbrenabled(const void *data)
return !!(sup_feat & MBR_ENABLED_MASK); return !!(sup_feat & MBR_ENABLED_MASK);
} }
static bool check_mbrdone(const void *data)
{
const struct d0_locking_features *lfeat = data;
u8 sup_feat = lfeat->supported_features;
return !!(sup_feat & MBR_DONE_MASK);
}
static bool check_sum(const void *data) static bool check_sum(const void *data)
{ {
const struct d0_single_user_mode *sum = data; const struct d0_single_user_mode *sum = data;
@@ -435,7 +466,7 @@ static int opal_discovery0_end(struct opal_dev *dev)
u32 hlen = be32_to_cpu(hdr->length); u32 hlen = be32_to_cpu(hdr->length);
print_buffer(dev->resp, hlen); print_buffer(dev->resp, hlen);
dev->mbr_enabled = false; dev->flags &= OPAL_FL_SUPPORTED;
if (hlen > IO_BUFFER_LENGTH - sizeof(*hdr)) { if (hlen > IO_BUFFER_LENGTH - sizeof(*hdr)) {
pr_debug("Discovery length overflows buffer (%zu+%u)/%u\n", pr_debug("Discovery length overflows buffer (%zu+%u)/%u\n",
@@ -461,7 +492,16 @@ static int opal_discovery0_end(struct opal_dev *dev)
check_geometry(dev, body); check_geometry(dev, body);
break; break;
case FC_LOCKING: case FC_LOCKING:
dev->mbr_enabled = check_mbrenabled(body->features); if (check_lcksuppt(body->features))
dev->flags |= OPAL_FL_LOCKING_SUPPORTED;
if (check_lckenabled(body->features))
dev->flags |= OPAL_FL_LOCKING_ENABLED;
if (check_locked(body->features))
dev->flags |= OPAL_FL_LOCKED;
if (check_mbrenabled(body->features))
dev->flags |= OPAL_FL_MBR_ENABLED;
if (check_mbrdone(body->features))
dev->flags |= OPAL_FL_MBR_DONE;
break; break;
case FC_ENTERPRISE: case FC_ENTERPRISE:
case FC_DATASTORE: case FC_DATASTORE:
@@ -2109,7 +2149,8 @@ static int check_opal_support(struct opal_dev *dev)
mutex_lock(&dev->dev_lock); mutex_lock(&dev->dev_lock);
setup_opal_dev(dev); setup_opal_dev(dev);
ret = opal_discovery0_step(dev); ret = opal_discovery0_step(dev);
dev->supported = !ret; if (!ret)
dev->flags |= OPAL_FL_SUPPORTED;
mutex_unlock(&dev->dev_lock); mutex_unlock(&dev->dev_lock);
return ret; return ret;
@@ -2148,6 +2189,7 @@ struct opal_dev *init_opal_dev(void *data, sec_send_recv *send_recv)
INIT_LIST_HEAD(&dev->unlk_lst); INIT_LIST_HEAD(&dev->unlk_lst);
mutex_init(&dev->dev_lock); mutex_init(&dev->dev_lock);
dev->flags = 0;
dev->data = data; dev->data = data;
dev->send_recv = send_recv; dev->send_recv = send_recv;
if (check_opal_support(dev) != 0) { if (check_opal_support(dev) != 0) {
@@ -2528,7 +2570,7 @@ bool opal_unlock_from_suspend(struct opal_dev *dev)
if (!dev) if (!dev)
return false; return false;
if (!dev->supported) if (!(dev->flags & OPAL_FL_SUPPORTED))
return false; return false;
mutex_lock(&dev->dev_lock); mutex_lock(&dev->dev_lock);
@@ -2546,7 +2588,7 @@ bool opal_unlock_from_suspend(struct opal_dev *dev)
was_failure = true; was_failure = true;
} }
if (dev->mbr_enabled) { if (dev->flags & OPAL_FL_MBR_ENABLED) {
ret = __opal_set_mbr_done(dev, &suspend->unlk.session.opal_key); ret = __opal_set_mbr_done(dev, &suspend->unlk.session.opal_key);
if (ret) if (ret)
pr_debug("Failed to set MBR Done in S3 resume\n"); pr_debug("Failed to set MBR Done in S3 resume\n");
@@ -2620,6 +2662,23 @@ static int opal_generic_read_write_table(struct opal_dev *dev,
return ret; return ret;
} }
static int opal_get_status(struct opal_dev *dev, void __user *data)
{
struct opal_status sts = {0};
/*
* check_opal_support() error is not fatal,
* !dev->supported is a valid condition
*/
if (!check_opal_support(dev))
sts.flags = dev->flags;
if (copy_to_user(data, &sts, sizeof(sts))) {
pr_debug("Error copying status to userspace\n");
return -EFAULT;
}
return 0;
}
int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *arg) int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *arg)
{ {
void *p; void *p;
@@ -2629,12 +2688,14 @@ int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *arg)
return -EACCES; return -EACCES;
if (!dev) if (!dev)
return -ENOTSUPP; return -ENOTSUPP;
if (!dev->supported) if (!(dev->flags & OPAL_FL_SUPPORTED))
return -ENOTSUPP; return -ENOTSUPP;
p = memdup_user(arg, _IOC_SIZE(cmd)); if (cmd & IOC_IN) {
if (IS_ERR(p)) p = memdup_user(arg, _IOC_SIZE(cmd));
return PTR_ERR(p); if (IS_ERR(p))
return PTR_ERR(p);
}
switch (cmd) { switch (cmd) {
case IOC_OPAL_SAVE: case IOC_OPAL_SAVE:
@@ -2685,11 +2746,15 @@ int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *arg)
case IOC_OPAL_GENERIC_TABLE_RW: case IOC_OPAL_GENERIC_TABLE_RW:
ret = opal_generic_read_write_table(dev, p); ret = opal_generic_read_write_table(dev, p);
break; break;
case IOC_OPAL_GET_STATUS:
ret = opal_get_status(dev, arg);
break;
default: default:
break; break;
} }
kfree(p); if (cmd & IOC_IN)
kfree(p);
return ret; return ret;
} }
EXPORT_SYMBOL_GPL(sed_ioctl); EXPORT_SYMBOL_GPL(sed_ioctl);
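The new IOC_OPAL_GET_STATUS path above exposes dev->flags to userspace. A minimal userspace sketch of a caller follows; it assumes the updated <linux/sed-opal.h> uapi header exports struct opal_status and the OPAL_FL_* names used in this diff, and that the caller has CAP_SYS_ADMIN as required by sed_ioctl():

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/sed-opal.h>

int main(int argc, char **argv)
{
	struct opal_status sts = { 0 };
	int fd;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <block device>\n", argv[0]);
		return 1;
	}

	fd = open(argv[1], O_RDONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* read-only query; no IOC_IN payload, so no memdup_user() in the kernel */
	if (ioctl(fd, IOC_OPAL_GET_STATUS, &sts) < 0) {
		perror("IOC_OPAL_GET_STATUS");
		close(fd);
		return 1;
	}

	printf("opal supported: %s, locked: %s, MBR enabled: %s\n",
	       (sts.flags & OPAL_FL_SUPPORTED) ? "yes" : "no",
	       (sts.flags & OPAL_FL_LOCKED) ? "yes" : "no",
	       (sts.flags & OPAL_FL_MBR_ENABLED) ? "yes" : "no");
	close(fd);
	return 0;
}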


@@ -108,7 +108,7 @@ static ssize_t aoedisk_show_payload(struct device *dev,
return sysfs_emit(page, "%lu\n", d->maxbcnt); return sysfs_emit(page, "%lu\n", d->maxbcnt);
} }
static int aoedisk_debugfs_show(struct seq_file *s, void *ignored) static int aoe_debugfs_show(struct seq_file *s, void *ignored)
{ {
struct aoedev *d; struct aoedev *d;
struct aoetgt **t, **te; struct aoetgt **t, **te;
@@ -151,11 +151,7 @@ static int aoedisk_debugfs_show(struct seq_file *s, void *ignored)
return 0; return 0;
} }
DEFINE_SHOW_ATTRIBUTE(aoe_debugfs);
static int aoe_debugfs_open(struct inode *inode, struct file *file)
{
return single_open(file, aoedisk_debugfs_show, inode->i_private);
}
static DEVICE_ATTR(state, 0444, aoedisk_show_state, NULL); static DEVICE_ATTR(state, 0444, aoedisk_show_state, NULL);
static DEVICE_ATTR(mac, 0444, aoedisk_show_mac, NULL); static DEVICE_ATTR(mac, 0444, aoedisk_show_mac, NULL);
@@ -184,13 +180,6 @@ static const struct attribute_group *aoe_attr_groups[] = {
NULL, NULL,
}; };
static const struct file_operations aoe_debugfs_fops = {
.open = aoe_debugfs_open,
.read = seq_read,
.llseek = seq_lseek,
.release = single_release,
};
static void static void
aoedisk_add_debugfs(struct aoedev *d) aoedisk_add_debugfs(struct aoedev *d)
{ {


@@ -397,7 +397,7 @@ static int brd_alloc(int i)
disk->minors = max_part; disk->minors = max_part;
disk->fops = &brd_fops; disk->fops = &brd_fops;
disk->private_data = brd; disk->private_data = brd;
strlcpy(disk->disk_name, buf, DISK_NAME_LEN); strscpy(disk->disk_name, buf, DISK_NAME_LEN);
set_capacity(disk, rd_size * 2); set_capacity(disk, rd_size * 2);
/* /*


@@ -1529,7 +1529,6 @@ extern int w_send_read_req(struct drbd_work *, int);
extern int w_e_reissue(struct drbd_work *, int); extern int w_e_reissue(struct drbd_work *, int);
extern int w_restart_disk_io(struct drbd_work *, int); extern int w_restart_disk_io(struct drbd_work *, int);
extern int w_send_out_of_sync(struct drbd_work *, int); extern int w_send_out_of_sync(struct drbd_work *, int);
extern int w_start_resync(struct drbd_work *, int);
extern void resync_timer_fn(struct timer_list *t); extern void resync_timer_fn(struct timer_list *t);
extern void start_resync_timer_fn(struct timer_list *t); extern void start_resync_timer_fn(struct timer_list *t);


@@ -4752,7 +4752,7 @@ void notify_helper(enum drbd_notification_type type,
struct drbd_genlmsghdr *dh; struct drbd_genlmsghdr *dh;
int err; int err;
strlcpy(helper_info.helper_name, name, sizeof(helper_info.helper_name)); strscpy(helper_info.helper_name, name, sizeof(helper_info.helper_name));
helper_info.helper_name_len = min(strlen(name), sizeof(helper_info.helper_name)); helper_info.helper_name_len = min(strlen(name), sizeof(helper_info.helper_name));
helper_info.helper_status = status; helper_info.helper_status = status;


@@ -2113,9 +2113,6 @@ static int receive_DataReply(struct drbd_connection *connection, struct packet_i
if (unlikely(!req)) if (unlikely(!req))
return -EIO; return -EIO;
/* hlist_del(&req->collision) is done in _req_may_be_done, to avoid
* special casing it there for the various failure cases.
* still no race with drbd_fail_pending_reads */
err = recv_dless_read(peer_device, req, sector, pi->size); err = recv_dless_read(peer_device, req, sector, pi->size);
if (!err) if (!err)
req_mod(req, DATA_RECEIVED); req_mod(req, DATA_RECEIVED);


@@ -266,8 +266,6 @@ struct bio_and_error {
extern void start_new_tl_epoch(struct drbd_connection *connection); extern void start_new_tl_epoch(struct drbd_connection *connection);
extern void drbd_req_destroy(struct kref *kref); extern void drbd_req_destroy(struct kref *kref);
extern void _req_may_be_done(struct drbd_request *req,
struct bio_and_error *m);
extern int __req_mod(struct drbd_request *req, enum drbd_req_event what, extern int __req_mod(struct drbd_request *req, enum drbd_req_event what,
struct bio_and_error *m); struct bio_and_error *m);
extern void complete_master_bio(struct drbd_device *device, extern void complete_master_bio(struct drbd_device *device,


@@ -1397,15 +1397,15 @@ static void mtip_dump_identify(struct mtip_port *port)
if (!port->identify_valid) if (!port->identify_valid)
return; return;
strlcpy(cbuf, (char *)(port->identify+10), 21); strscpy(cbuf, (char *)(port->identify + 10), 21);
dev_info(&port->dd->pdev->dev, dev_info(&port->dd->pdev->dev,
"Serial No.: %s\n", cbuf); "Serial No.: %s\n", cbuf);
strlcpy(cbuf, (char *)(port->identify+23), 9); strscpy(cbuf, (char *)(port->identify + 23), 9);
dev_info(&port->dd->pdev->dev, dev_info(&port->dd->pdev->dev,
"Firmware Ver.: %s\n", cbuf); "Firmware Ver.: %s\n", cbuf);
strlcpy(cbuf, (char *)(port->identify+27), 41); strscpy(cbuf, (char *)(port->identify + 27), 41);
dev_info(&port->dd->pdev->dev, "Model: %s\n", cbuf); dev_info(&port->dd->pdev->dev, "Model: %s\n", cbuf);
dev_info(&port->dd->pdev->dev, "Security: %04x %s\n", dev_info(&port->dd->pdev->dev, "Security: %04x %s\n",
@@ -1421,13 +1421,13 @@ static void mtip_dump_identify(struct mtip_port *port)
pci_read_config_word(port->dd->pdev, PCI_REVISION_ID, &revid); pci_read_config_word(port->dd->pdev, PCI_REVISION_ID, &revid);
switch (revid & 0xFF) { switch (revid & 0xFF) {
case 0x1: case 0x1:
strlcpy(cbuf, "A0", 3); strscpy(cbuf, "A0", 3);
break; break;
case 0x3: case 0x3:
strlcpy(cbuf, "A2", 3); strscpy(cbuf, "A2", 3);
break; break;
default: default:
strlcpy(cbuf, "?", 2); strscpy(cbuf, "?", 2);
break; break;
} }
dev_info(&port->dd->pdev->dev, dev_info(&port->dd->pdev->dev,


@@ -1413,10 +1413,12 @@ static int nbd_start_device_ioctl(struct nbd_device *nbd)
mutex_unlock(&nbd->config_lock); mutex_unlock(&nbd->config_lock);
ret = wait_event_interruptible(config->recv_wq, ret = wait_event_interruptible(config->recv_wq,
atomic_read(&config->recv_threads) == 0); atomic_read(&config->recv_threads) == 0);
if (ret) if (ret) {
sock_shutdown(nbd); sock_shutdown(nbd);
flush_workqueue(nbd->recv_workq); nbd_clear_que(nbd);
}
flush_workqueue(nbd->recv_workq);
mutex_lock(&nbd->config_lock); mutex_lock(&nbd->config_lock);
nbd_bdev_reset(nbd); nbd_bdev_reset(nbd);
/* user requested, ignore socket errors */ /* user requested, ignore socket errors */


@@ -1528,7 +1528,7 @@ static bool should_requeue_request(struct request *rq)
return false; return false;
} }
static int null_map_queues(struct blk_mq_tag_set *set) static void null_map_queues(struct blk_mq_tag_set *set)
{ {
struct nullb *nullb = set->driver_data; struct nullb *nullb = set->driver_data;
int i, qoff; int i, qoff;
@@ -1555,7 +1555,9 @@ static int null_map_queues(struct blk_mq_tag_set *set)
} else { } else {
pr_warn("tag set has unexpected nr_hw_queues: %d\n", pr_warn("tag set has unexpected nr_hw_queues: %d\n",
set->nr_hw_queues); set->nr_hw_queues);
return -EINVAL; WARN_ON_ONCE(true);
submit_queues = 1;
poll_queues = 0;
} }
} }
@@ -1577,8 +1579,6 @@ static int null_map_queues(struct blk_mq_tag_set *set)
qoff += map->nr_queues; qoff += map->nr_queues;
blk_mq_map_queues(map); blk_mq_map_queues(map);
} }
return 0;
} }
static int null_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob) static int null_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob)


@@ -745,7 +745,7 @@ static int ps3vram_probe(struct ps3_system_bus_device *dev)
gendisk->flags |= GENHD_FL_NO_PART; gendisk->flags |= GENHD_FL_NO_PART;
gendisk->fops = &ps3vram_fops; gendisk->fops = &ps3vram_fops;
gendisk->private_data = dev; gendisk->private_data = dev;
strlcpy(gendisk->disk_name, DEVICE_NAME, sizeof(gendisk->disk_name)); strscpy(gendisk->disk_name, DEVICE_NAME, sizeof(gendisk->disk_name));
set_capacity(gendisk, priv->size >> 9); set_capacity(gendisk, priv->size >> 9);
blk_queue_max_segments(gendisk->queue, BLK_MAX_SEGMENTS); blk_queue_max_segments(gendisk->queue, BLK_MAX_SEGMENTS);
blk_queue_max_segment_size(gendisk->queue, BLK_MAX_SEGMENT_SIZE); blk_queue_max_segment_size(gendisk->queue, BLK_MAX_SEGMENT_SIZE);


@@ -6,10 +6,12 @@ rnbd-client-y := rnbd-clt.o \
rnbd-clt-sysfs.o \ rnbd-clt-sysfs.o \
rnbd-common.o rnbd-common.o
CFLAGS_rnbd-srv-trace.o = -I$(src)
rnbd-server-y := rnbd-common.o \ rnbd-server-y := rnbd-common.o \
rnbd-srv.o \ rnbd-srv.o \
rnbd-srv-dev.o \ rnbd-srv-sysfs.o \
rnbd-srv-sysfs.o rnbd-srv-trace.o
obj-$(CONFIG_BLK_DEV_RNBD_CLIENT) += rnbd-client.o obj-$(CONFIG_BLK_DEV_RNBD_CLIENT) += rnbd-client.o
obj-$(CONFIG_BLK_DEV_RNBD_SERVER) += rnbd-server.o obj-$(CONFIG_BLK_DEV_RNBD_SERVER) += rnbd-server.o


@@ -1159,13 +1159,11 @@ static int rnbd_rdma_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob)
{ {
struct rnbd_queue *q = hctx->driver_data; struct rnbd_queue *q = hctx->driver_data;
struct rnbd_clt_dev *dev = q->dev; struct rnbd_clt_dev *dev = q->dev;
int cnt;
cnt = rtrs_clt_rdma_cq_direct(dev->sess->rtrs, hctx->queue_num); return rtrs_clt_rdma_cq_direct(dev->sess->rtrs, hctx->queue_num);
return cnt;
} }
static int rnbd_rdma_map_queues(struct blk_mq_tag_set *set) static void rnbd_rdma_map_queues(struct blk_mq_tag_set *set)
{ {
struct rnbd_clt_session *sess = set->driver_data; struct rnbd_clt_session *sess = set->driver_data;
@@ -1194,8 +1192,6 @@ static int rnbd_rdma_map_queues(struct blk_mq_tag_set *set)
set->map[HCTX_TYPE_DEFAULT].nr_queues, set->map[HCTX_TYPE_DEFAULT].nr_queues,
set->map[HCTX_TYPE_READ].nr_queues); set->map[HCTX_TYPE_READ].nr_queues);
} }
return 0;
} }
static struct blk_mq_ops rnbd_mq_ops = { static struct blk_mq_ops rnbd_mq_ops = {


@@ -1,43 +0,0 @@
// SPDX-License-Identifier: GPL-2.0-or-later
/*
* RDMA Network Block Driver
*
* Copyright (c) 2014 - 2018 ProfitBricks GmbH. All rights reserved.
* Copyright (c) 2018 - 2019 1&1 IONOS Cloud GmbH. All rights reserved.
* Copyright (c) 2019 - 2020 1&1 IONOS SE. All rights reserved.
*/
#undef pr_fmt
#define pr_fmt(fmt) KBUILD_MODNAME " L" __stringify(__LINE__) ": " fmt
#include "rnbd-srv-dev.h"
#include "rnbd-log.h"
struct rnbd_dev *rnbd_dev_open(const char *path, fmode_t flags)
{
struct rnbd_dev *dev;
int ret;
dev = kzalloc(sizeof(*dev), GFP_KERNEL);
if (!dev)
return ERR_PTR(-ENOMEM);
dev->blk_open_flags = flags;
dev->bdev = blkdev_get_by_path(path, flags, THIS_MODULE);
ret = PTR_ERR_OR_ZERO(dev->bdev);
if (ret)
goto err;
dev->blk_open_flags = flags;
return dev;
err:
kfree(dev);
return ERR_PTR(ret);
}
void rnbd_dev_close(struct rnbd_dev *dev)
{
blkdev_put(dev->bdev, dev->blk_open_flags);
kfree(dev);
}


@@ -1,64 +0,0 @@
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
* RDMA Network Block Driver
*
* Copyright (c) 2014 - 2018 ProfitBricks GmbH. All rights reserved.
* Copyright (c) 2018 - 2019 1&1 IONOS Cloud GmbH. All rights reserved.
* Copyright (c) 2019 - 2020 1&1 IONOS SE. All rights reserved.
*/
#ifndef RNBD_SRV_DEV_H
#define RNBD_SRV_DEV_H
#include <linux/fs.h>
#include "rnbd-proto.h"
struct rnbd_dev {
struct block_device *bdev;
fmode_t blk_open_flags;
};
/**
* rnbd_dev_open() - Open a device
* @path: path to open
* @flags: open flags
*/
struct rnbd_dev *rnbd_dev_open(const char *path, fmode_t flags);
/**
* rnbd_dev_close() - Close a device
*/
void rnbd_dev_close(struct rnbd_dev *dev);
void rnbd_endio(void *priv, int error);
static inline int rnbd_dev_get_max_segs(const struct rnbd_dev *dev)
{
return queue_max_segments(bdev_get_queue(dev->bdev));
}
static inline int rnbd_dev_get_max_hw_sects(const struct rnbd_dev *dev)
{
return queue_max_hw_sectors(bdev_get_queue(dev->bdev));
}
static inline int rnbd_dev_get_secure_discard(const struct rnbd_dev *dev)
{
return bdev_max_secure_erase_sectors(dev->bdev);
}
static inline int rnbd_dev_get_max_discard_sects(const struct rnbd_dev *dev)
{
return bdev_max_discard_sectors(dev->bdev);
}
static inline int rnbd_dev_get_discard_granularity(const struct rnbd_dev *dev)
{
return bdev_get_queue(dev->bdev)->limits.discard_granularity;
}
static inline int rnbd_dev_get_discard_alignment(const struct rnbd_dev *dev)
{
return bdev_discard_alignment(dev->bdev);
}
#endif /* RNBD_SRV_DEV_H */


@@ -0,0 +1,17 @@
// SPDX-License-Identifier: GPL-2.0+
/*
* RDMA Network Block Driver
*
* Copyright (c) 2022 1&1 IONOS SE. All rights reserved.
*/
#include "rtrs.h"
#include "rtrs-srv.h"
#include "rnbd-srv.h"
#include "rnbd-proto.h"
/*
* We include this last to have the helpers above available for the trace
* event implementations.
*/
#define CREATE_TRACE_POINTS
#include "rnbd-srv-trace.h"


@@ -0,0 +1,207 @@
/* SPDX-License-Identifier: GPL-2.0+ */
/*
* RDMA Network Block Driver
*
* Copyright (c) 2022 1&1 IONOS SE. All rights reserved.
*/
#undef TRACE_SYSTEM
#define TRACE_SYSTEM rnbd_srv
#if !defined(_TRACE_RNBD_SRV_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_RNBD_SRV_H
#include <linux/tracepoint.h>
struct rnbd_srv_session;
struct rtrs_srv_op;
DECLARE_EVENT_CLASS(rnbd_srv_link_class,
TP_PROTO(struct rnbd_srv_session *srv),
TP_ARGS(srv),
TP_STRUCT__entry(
__field(int, qdepth)
__string(sessname, srv->sessname)
),
TP_fast_assign(
__entry->qdepth = srv->queue_depth;
__assign_str(sessname, srv->sessname);
),
TP_printk("sessname: %s qdepth: %d",
__get_str(sessname),
__entry->qdepth
)
);
#define DEFINE_LINK_EVENT(name) \
DEFINE_EVENT(rnbd_srv_link_class, name, \
TP_PROTO(struct rnbd_srv_session *srv), \
TP_ARGS(srv))
DEFINE_LINK_EVENT(create_sess);
DEFINE_LINK_EVENT(destroy_sess);
TRACE_DEFINE_ENUM(RNBD_OP_READ);
TRACE_DEFINE_ENUM(RNBD_OP_WRITE);
TRACE_DEFINE_ENUM(RNBD_OP_FLUSH);
TRACE_DEFINE_ENUM(RNBD_OP_DISCARD);
TRACE_DEFINE_ENUM(RNBD_OP_SECURE_ERASE);
TRACE_DEFINE_ENUM(RNBD_F_SYNC);
TRACE_DEFINE_ENUM(RNBD_F_FUA);
#define show_rnbd_rw_flags(x) \
__print_flags(x, "|", \
{ RNBD_OP_READ, "READ" }, \
{ RNBD_OP_WRITE, "WRITE" }, \
{ RNBD_OP_FLUSH, "FLUSH" }, \
{ RNBD_OP_DISCARD, "DISCARD" }, \
{ RNBD_OP_SECURE_ERASE, "SECURE_ERASE" }, \
{ RNBD_F_SYNC, "SYNC" }, \
{ RNBD_F_FUA, "FUA" })
TRACE_EVENT(process_rdma,
TP_PROTO(struct rnbd_srv_session *srv,
const struct rnbd_msg_io *msg,
struct rtrs_srv_op *id,
u32 datalen,
size_t usrlen),
TP_ARGS(srv, msg, id, datalen, usrlen),
TP_STRUCT__entry(
__string(sessname, srv->sessname)
__field(u8, dir)
__field(u8, ver)
__field(u32, device_id)
__field(u64, sector)
__field(u32, flags)
__field(u32, bi_size)
__field(u16, ioprio)
__field(u32, datalen)
__field(size_t, usrlen)
),
TP_fast_assign(
__assign_str(sessname, srv->sessname);
__entry->dir = id->dir;
__entry->ver = srv->ver;
__entry->device_id = le32_to_cpu(msg->device_id);
__entry->sector = le64_to_cpu(msg->sector);
__entry->bi_size = le32_to_cpu(msg->bi_size);
__entry->flags = le32_to_cpu(msg->rw);
__entry->ioprio = le16_to_cpu(msg->prio);
__entry->datalen = datalen;
__entry->usrlen = usrlen;
),
TP_printk("I/O req: sess: %s, type: %s, ver: %d, devid: %u, sector: %llu, bsize: %u, flags: %s, ioprio: %d, datalen: %u, usrlen: %zu",
__get_str(sessname),
__print_symbolic(__entry->dir,
{ READ, "READ" },
{ WRITE, "WRITE" }),
__entry->ver,
__entry->device_id,
__entry->sector,
__entry->bi_size,
show_rnbd_rw_flags(__entry->flags),
__entry->ioprio,
__entry->datalen,
__entry->usrlen
)
);
TRACE_EVENT(process_msg_sess_info,
TP_PROTO(struct rnbd_srv_session *srv,
const struct rnbd_msg_sess_info *msg),
TP_ARGS(srv, msg),
TP_STRUCT__entry(
__field(u8, proto_ver)
__field(u8, clt_ver)
__field(u8, srv_ver)
__string(sessname, srv->sessname)
),
TP_fast_assign(
__entry->proto_ver = srv->ver;
__entry->clt_ver = msg->ver;
__entry->srv_ver = RNBD_PROTO_VER_MAJOR;
__assign_str(sessname, srv->sessname);
),
TP_printk("Session %s using proto-ver %d (clt-ver: %d, srv-ver: %d)",
__get_str(sessname),
__entry->proto_ver,
__entry->clt_ver,
__entry->srv_ver
)
);
TRACE_DEFINE_ENUM(RNBD_ACCESS_RO);
TRACE_DEFINE_ENUM(RNBD_ACCESS_RW);
TRACE_DEFINE_ENUM(RNBD_ACCESS_MIGRATION);
#define show_rnbd_access_mode(x) \
__print_symbolic(x, \
{ RNBD_ACCESS_RO, "RO" }, \
{ RNBD_ACCESS_RW, "RW" }, \
{ RNBD_ACCESS_MIGRATION, "MIGRATION" })
TRACE_EVENT(process_msg_open,
TP_PROTO(struct rnbd_srv_session *srv,
const struct rnbd_msg_open *msg),
TP_ARGS(srv, msg),
TP_STRUCT__entry(
__field(u8, access_mode)
__string(sessname, srv->sessname)
__string(dev_name, msg->dev_name)
),
TP_fast_assign(
__entry->access_mode = msg->access_mode;
__assign_str(sessname, srv->sessname);
__assign_str(dev_name, msg->dev_name);
),
TP_printk("Open message received: session='%s' path='%s' access_mode=%s",
__get_str(sessname),
__get_str(dev_name),
show_rnbd_access_mode(__entry->access_mode)
)
);
TRACE_EVENT(process_msg_close,
TP_PROTO(struct rnbd_srv_session *srv,
const struct rnbd_msg_close *msg),
TP_ARGS(srv, msg),
TP_STRUCT__entry(
__field(u32, device_id)
__string(sessname, srv->sessname)
),
TP_fast_assign(
__entry->device_id = le32_to_cpu(msg->device_id);
__assign_str(sessname, srv->sessname);
),
TP_printk("Close message received: session='%s' device id='%d'",
__get_str(sessname),
__entry->device_id
)
);
#endif /* _TRACE_RNBD_SRV_H */
#undef TRACE_INCLUDE_PATH
#define TRACE_INCLUDE_PATH .
#define TRACE_INCLUDE_FILE rnbd-srv-trace
#include <trace/define_trace.h>


@@ -13,7 +13,7 @@
#include <linux/blkdev.h> #include <linux/blkdev.h>
#include "rnbd-srv.h" #include "rnbd-srv.h"
#include "rnbd-srv-dev.h" #include "rnbd-srv-trace.h"
MODULE_DESCRIPTION("RDMA Network Block Device Server"); MODULE_DESCRIPTION("RDMA Network Block Device Server");
MODULE_LICENSE("GPL"); MODULE_LICENSE("GPL");
@@ -84,18 +84,6 @@ static inline void rnbd_put_sess_dev(struct rnbd_srv_sess_dev *sess_dev)
kref_put(&sess_dev->kref, rnbd_sess_dev_release); kref_put(&sess_dev->kref, rnbd_sess_dev_release);
} }
void rnbd_endio(void *priv, int error)
{
struct rnbd_io_private *rnbd_priv = priv;
struct rnbd_srv_sess_dev *sess_dev = rnbd_priv->sess_dev;
rnbd_put_sess_dev(sess_dev);
rtrs_srv_resp_rdma(rnbd_priv->id, error);
kfree(priv);
}
static struct rnbd_srv_sess_dev * static struct rnbd_srv_sess_dev *
rnbd_get_sess_dev(int dev_id, struct rnbd_srv_session *srv_sess) rnbd_get_sess_dev(int dev_id, struct rnbd_srv_session *srv_sess)
{ {
@@ -116,7 +104,13 @@ rnbd_get_sess_dev(int dev_id, struct rnbd_srv_session *srv_sess)
 static void rnbd_dev_bi_end_io(struct bio *bio)
 {
-	rnbd_endio(bio->bi_private, blk_status_to_errno(bio->bi_status));
+	struct rnbd_io_private *rnbd_priv = bio->bi_private;
+	struct rnbd_srv_sess_dev *sess_dev = rnbd_priv->sess_dev;
+
+	rnbd_put_sess_dev(sess_dev);
+	rtrs_srv_resp_rdma(rnbd_priv->id, blk_status_to_errno(bio->bi_status));
+	kfree(rnbd_priv);
+
 	bio_put(bio);
 }
@@ -132,6 +126,8 @@ static int process_rdma(struct rnbd_srv_session *srv_sess,
struct bio *bio; struct bio *bio;
short prio; short prio;
trace_process_rdma(srv_sess, msg, id, datalen, usrlen);
priv = kmalloc(sizeof(*priv), GFP_KERNEL); priv = kmalloc(sizeof(*priv), GFP_KERNEL);
if (!priv) if (!priv)
return -ENOMEM; return -ENOMEM;
@@ -149,7 +145,7 @@ static int process_rdma(struct rnbd_srv_session *srv_sess,
priv->sess_dev = sess_dev; priv->sess_dev = sess_dev;
priv->id = id; priv->id = id;
bio = bio_alloc(sess_dev->rnbd_dev->bdev, 1, bio = bio_alloc(sess_dev->bdev, 1,
rnbd_to_bio_flags(le32_to_cpu(msg->rw)), GFP_KERNEL); rnbd_to_bio_flags(le32_to_cpu(msg->rw)), GFP_KERNEL);
if (bio_add_page(bio, virt_to_page(data), datalen, if (bio_add_page(bio, virt_to_page(data), datalen,
offset_in_page(data)) != datalen) { offset_in_page(data)) != datalen) {
@@ -223,7 +219,7 @@ void rnbd_destroy_sess_dev(struct rnbd_srv_sess_dev *sess_dev, bool keep_id)
rnbd_put_sess_dev(sess_dev); rnbd_put_sess_dev(sess_dev);
wait_for_completion(&dc); /* wait for inflights to drop to zero */ wait_for_completion(&dc); /* wait for inflights to drop to zero */
rnbd_dev_close(sess_dev->rnbd_dev); blkdev_put(sess_dev->bdev, sess_dev->open_flags);
mutex_lock(&sess_dev->dev->lock); mutex_lock(&sess_dev->dev->lock);
list_del(&sess_dev->dev_list); list_del(&sess_dev->dev_list);
if (sess_dev->open_flags & FMODE_WRITE) if (sess_dev->open_flags & FMODE_WRITE)
@@ -244,6 +240,8 @@ static void destroy_sess(struct rnbd_srv_session *srv_sess)
if (xa_empty(&srv_sess->index_idr)) if (xa_empty(&srv_sess->index_idr))
goto out; goto out;
trace_destroy_sess(srv_sess);
mutex_lock(&srv_sess->lock); mutex_lock(&srv_sess->lock);
xa_for_each(&srv_sess->index_idr, index, sess_dev) xa_for_each(&srv_sess->index_idr, index, sess_dev)
rnbd_srv_destroy_dev_session_sysfs(sess_dev); rnbd_srv_destroy_dev_session_sysfs(sess_dev);
@@ -290,6 +288,8 @@ static int create_sess(struct rtrs_srv_sess *rtrs)
rtrs_srv_set_sess_priv(rtrs, srv_sess); rtrs_srv_set_sess_priv(rtrs, srv_sess);
trace_create_sess(srv_sess);
return 0; return 0;
} }
@@ -332,23 +332,24 @@ void rnbd_srv_sess_dev_force_close(struct rnbd_srv_sess_dev *sess_dev,
mutex_unlock(&sess->lock); mutex_unlock(&sess->lock);
} }
-static int process_msg_close(struct rnbd_srv_session *srv_sess,
+static void process_msg_close(struct rnbd_srv_session *srv_sess,
 			     void *data, size_t datalen, const void *usr,
 			     size_t usrlen)
 {
 	const struct rnbd_msg_close *close_msg = usr;
 	struct rnbd_srv_sess_dev *sess_dev;

+	trace_process_msg_close(srv_sess, close_msg);
+
 	sess_dev = rnbd_get_sess_dev(le32_to_cpu(close_msg->device_id),
 				     srv_sess);
 	if (IS_ERR(sess_dev))
-		return 0;
+		return;

 	rnbd_put_sess_dev(sess_dev);
 	mutex_lock(&srv_sess->lock);
 	rnbd_srv_destroy_dev_session_sysfs(sess_dev);
 	mutex_unlock(&srv_sess->lock);
-	return 0;
 }
static int process_msg_open(struct rnbd_srv_session *srv_sess, static int process_msg_open(struct rnbd_srv_session *srv_sess,
@@ -378,7 +379,7 @@ static int rnbd_srv_rdma_ev(void *priv,
case RNBD_MSG_IO: case RNBD_MSG_IO:
return process_rdma(srv_sess, id, data, datalen, usr, usrlen); return process_rdma(srv_sess, id, data, datalen, usr, usrlen);
case RNBD_MSG_CLOSE: case RNBD_MSG_CLOSE:
ret = process_msg_close(srv_sess, data, datalen, usr, usrlen); process_msg_close(srv_sess, data, datalen, usr, usrlen);
break; break;
case RNBD_MSG_OPEN: case RNBD_MSG_OPEN:
ret = process_msg_open(srv_sess, usr, usrlen, data, datalen); ret = process_msg_open(srv_sess, usr, usrlen, data, datalen);
@@ -393,6 +394,11 @@ static int rnbd_srv_rdma_ev(void *priv,
return -EINVAL; return -EINVAL;
} }
/*
* Since ret is passed to rtrs to handle the failure case, we
* just return 0 at the end; otherwise callers in rtrs would call
* send_io_resp_imm() again and print a redundant error message.
*/
rtrs_srv_resp_rdma(id, ret); rtrs_srv_resp_rdma(id, ret);
return 0; return 0;
} }
@@ -504,14 +510,14 @@ static int rnbd_srv_check_update_open_perm(struct rnbd_srv_dev *srv_dev,
} }
static struct rnbd_srv_dev * static struct rnbd_srv_dev *
rnbd_srv_get_or_create_srv_dev(struct rnbd_dev *rnbd_dev, rnbd_srv_get_or_create_srv_dev(struct block_device *bdev,
struct rnbd_srv_session *srv_sess, struct rnbd_srv_session *srv_sess,
enum rnbd_access_mode access_mode) enum rnbd_access_mode access_mode)
{ {
int ret; int ret;
struct rnbd_srv_dev *new_dev, *dev; struct rnbd_srv_dev *new_dev, *dev;
new_dev = rnbd_srv_init_srv_dev(rnbd_dev->bdev); new_dev = rnbd_srv_init_srv_dev(bdev);
if (IS_ERR(new_dev)) if (IS_ERR(new_dev))
return new_dev; return new_dev;
@@ -531,41 +537,32 @@ rnbd_srv_get_or_create_srv_dev(struct rnbd_dev *rnbd_dev,
 static void rnbd_srv_fill_msg_open_rsp(struct rnbd_msg_open_rsp *rsp,
 					struct rnbd_srv_sess_dev *sess_dev)
 {
-	struct rnbd_dev *rnbd_dev = sess_dev->rnbd_dev;
+	struct block_device *bdev = sess_dev->bdev;

 	rsp->hdr.type = cpu_to_le16(RNBD_MSG_OPEN_RSP);
-	rsp->device_id =
-		cpu_to_le32(sess_dev->device_id);
-	rsp->nsectors =
-		cpu_to_le64(get_capacity(rnbd_dev->bdev->bd_disk));
-	rsp->logical_block_size =
-		cpu_to_le16(bdev_logical_block_size(rnbd_dev->bdev));
-	rsp->physical_block_size =
-		cpu_to_le16(bdev_physical_block_size(rnbd_dev->bdev));
-	rsp->max_segments =
-		cpu_to_le16(rnbd_dev_get_max_segs(rnbd_dev));
+	rsp->device_id = cpu_to_le32(sess_dev->device_id);
+	rsp->nsectors = cpu_to_le64(bdev_nr_sectors(bdev));
+	rsp->logical_block_size = cpu_to_le16(bdev_logical_block_size(bdev));
+	rsp->physical_block_size = cpu_to_le16(bdev_physical_block_size(bdev));
+	rsp->max_segments = cpu_to_le16(bdev_max_segments(bdev));
 	rsp->max_hw_sectors =
-		cpu_to_le32(rnbd_dev_get_max_hw_sects(rnbd_dev));
+		cpu_to_le32(queue_max_hw_sectors(bdev_get_queue(bdev)));
 	rsp->max_write_same_sectors = 0;
-	rsp->max_discard_sectors =
-		cpu_to_le32(rnbd_dev_get_max_discard_sects(rnbd_dev));
-	rsp->discard_granularity =
-		cpu_to_le32(rnbd_dev_get_discard_granularity(rnbd_dev));
-	rsp->discard_alignment =
-		cpu_to_le32(rnbd_dev_get_discard_alignment(rnbd_dev));
-	rsp->secure_discard =
-		cpu_to_le16(rnbd_dev_get_secure_discard(rnbd_dev));
+	rsp->max_discard_sectors = cpu_to_le32(bdev_max_discard_sectors(bdev));
+	rsp->discard_granularity = cpu_to_le32(bdev_discard_granularity(bdev));
+	rsp->discard_alignment = cpu_to_le32(bdev_discard_alignment(bdev));
+	rsp->secure_discard = cpu_to_le16(bdev_max_secure_erase_sectors(bdev));
 	rsp->cache_policy = 0;
-	if (bdev_write_cache(rnbd_dev->bdev))
+	if (bdev_write_cache(bdev))
 		rsp->cache_policy |= RNBD_WRITEBACK;
-	if (bdev_fua(rnbd_dev->bdev))
+	if (bdev_fua(bdev))
 		rsp->cache_policy |= RNBD_FUA;
 }

 static struct rnbd_srv_sess_dev *
 rnbd_srv_create_set_sess_dev(struct rnbd_srv_session *srv_sess,
 			     const struct rnbd_msg_open *open_msg,
-			     struct rnbd_dev *rnbd_dev, fmode_t open_flags,
+			     struct block_device *bdev, fmode_t open_flags,
 			     struct rnbd_srv_dev *srv_dev)
 {
 	struct rnbd_srv_sess_dev *sdev = rnbd_sess_dev_alloc(srv_sess);
@@ -577,7 +574,7 @@ rnbd_srv_create_set_sess_dev(struct rnbd_srv_session *srv_sess,
strscpy(sdev->pathname, open_msg->dev_name, sizeof(sdev->pathname)); strscpy(sdev->pathname, open_msg->dev_name, sizeof(sdev->pathname));
sdev->rnbd_dev = rnbd_dev; sdev->bdev = bdev;
sdev->sess = srv_sess; sdev->sess = srv_sess;
sdev->dev = srv_dev; sdev->dev = srv_dev;
sdev->open_flags = open_flags; sdev->open_flags = open_flags;
@@ -643,9 +640,8 @@ static int process_msg_sess_info(struct rnbd_srv_session *srv_sess,
struct rnbd_msg_sess_info_rsp *rsp = data; struct rnbd_msg_sess_info_rsp *rsp = data;
srv_sess->ver = min_t(u8, sess_info_msg->ver, RNBD_PROTO_VER_MAJOR); srv_sess->ver = min_t(u8, sess_info_msg->ver, RNBD_PROTO_VER_MAJOR);
pr_debug("Session %s using protocol version %d (client version: %d, server version: %d)\n",
srv_sess->sessname, srv_sess->ver, trace_process_msg_sess_info(srv_sess, sess_info_msg);
sess_info_msg->ver, RNBD_PROTO_VER_MAJOR);
rsp->hdr.type = cpu_to_le16(RNBD_MSG_SESS_INFO_RSP); rsp->hdr.type = cpu_to_le16(RNBD_MSG_SESS_INFO_RSP);
rsp->ver = srv_sess->ver; rsp->ver = srv_sess->ver;
@@ -685,14 +681,13 @@ static int process_msg_open(struct rnbd_srv_session *srv_sess,
struct rnbd_srv_dev *srv_dev; struct rnbd_srv_dev *srv_dev;
struct rnbd_srv_sess_dev *srv_sess_dev; struct rnbd_srv_sess_dev *srv_sess_dev;
const struct rnbd_msg_open *open_msg = msg; const struct rnbd_msg_open *open_msg = msg;
struct block_device *bdev;
fmode_t open_flags; fmode_t open_flags;
char *full_path; char *full_path;
struct rnbd_dev *rnbd_dev;
struct rnbd_msg_open_rsp *rsp = data; struct rnbd_msg_open_rsp *rsp = data;
pr_debug("Open message received: session='%s' path='%s' access_mode=%d\n", trace_process_msg_open(srv_sess, open_msg);
srv_sess->sessname, open_msg->dev_name,
open_msg->access_mode);
open_flags = FMODE_READ; open_flags = FMODE_READ;
if (open_msg->access_mode != RNBD_ACCESS_RO) if (open_msg->access_mode != RNBD_ACCESS_RO)
open_flags |= FMODE_WRITE; open_flags |= FMODE_WRITE;
@@ -725,25 +720,25 @@ static int process_msg_open(struct rnbd_srv_session *srv_sess,
goto reject; goto reject;
} }
-	rnbd_dev = rnbd_dev_open(full_path, open_flags);
-	if (IS_ERR(rnbd_dev)) {
-		pr_err("Opening device '%s' on session %s failed, failed to open the block device, err: %ld\n",
-		       full_path, srv_sess->sessname, PTR_ERR(rnbd_dev));
-		ret = PTR_ERR(rnbd_dev);
+	bdev = blkdev_get_by_path(full_path, open_flags, THIS_MODULE);
+	if (IS_ERR(bdev)) {
+		ret = PTR_ERR(bdev);
+		pr_err("Opening device '%s' on session %s failed, failed to open the block device, err: %d\n",
+		       full_path, srv_sess->sessname, ret);
 		goto free_path;
 	}

-	srv_dev = rnbd_srv_get_or_create_srv_dev(rnbd_dev, srv_sess,
+	srv_dev = rnbd_srv_get_or_create_srv_dev(bdev, srv_sess,
 						 open_msg->access_mode);
 	if (IS_ERR(srv_dev)) {
 		pr_err("Opening device '%s' on session %s failed, creating srv_dev failed, err: %ld\n",
 		       full_path, srv_sess->sessname, PTR_ERR(srv_dev));
 		ret = PTR_ERR(srv_dev);
-		goto rnbd_dev_close;
+		goto blkdev_put;
 	}

 	srv_sess_dev = rnbd_srv_create_set_sess_dev(srv_sess, open_msg,
-						    rnbd_dev, open_flags,
+						    bdev, open_flags,
 						    srv_dev);
if (IS_ERR(srv_sess_dev)) { if (IS_ERR(srv_sess_dev)) {
pr_err("Opening device '%s' on session %s failed, creating sess_dev failed, err: %ld\n", pr_err("Opening device '%s' on session %s failed, creating sess_dev failed, err: %ld\n",
@@ -758,7 +753,7 @@ static int process_msg_open(struct rnbd_srv_session *srv_sess,
*/ */
mutex_lock(&srv_dev->lock); mutex_lock(&srv_dev->lock);
if (!srv_dev->dev_kobj.state_in_sysfs) { if (!srv_dev->dev_kobj.state_in_sysfs) {
ret = rnbd_srv_create_dev_sysfs(srv_dev, rnbd_dev->bdev); ret = rnbd_srv_create_dev_sysfs(srv_dev, bdev);
if (ret) { if (ret) {
mutex_unlock(&srv_dev->lock); mutex_unlock(&srv_dev->lock);
rnbd_srv_err(srv_sess_dev, rnbd_srv_err(srv_sess_dev,
@@ -800,8 +795,8 @@ srv_dev_put:
mutex_unlock(&srv_dev->lock); mutex_unlock(&srv_dev->lock);
} }
rnbd_put_srv_dev(srv_dev); rnbd_put_srv_dev(srv_dev);
rnbd_dev_close: blkdev_put:
rnbd_dev_close(rnbd_dev); blkdev_put(bdev, open_flags);
free_path: free_path:
kfree(full_path); kfree(full_path);
reject: reject:
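
With the rnbd_dev wrapper gone, the open/query/close life cycle collapses onto the generic block-layer helpers used in the hunks above. A condensed sketch under those assumptions (function name illustrative, error handling trimmed):

static int example_open_and_probe(const char *path, fmode_t mode)
{
	struct block_device *bdev;

	bdev = blkdev_get_by_path(path, mode, THIS_MODULE);
	if (IS_ERR(bdev))
		return PTR_ERR(bdev);

	pr_info("%pg: %llu sectors, %u byte logical blocks, discard max %u sectors\n",
		bdev, (unsigned long long)bdev_nr_sectors(bdev),
		bdev_logical_block_size(bdev),
		bdev_max_discard_sectors(bdev));

	blkdev_put(bdev, mode);	/* must be paired with blkdev_get_by_path() */
	return 0;
}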


@@ -46,7 +46,7 @@ struct rnbd_srv_dev {
struct rnbd_srv_sess_dev { struct rnbd_srv_sess_dev {
/* Entry inside rnbd_srv_dev struct */ /* Entry inside rnbd_srv_dev struct */
struct list_head dev_list; struct list_head dev_list;
struct rnbd_dev *rnbd_dev; struct block_device *bdev;
struct rnbd_srv_session *sess; struct rnbd_srv_session *sess;
struct rnbd_srv_dev *dev; struct rnbd_srv_dev *dev;
struct kobject kobj; struct kobject kobj;


@@ -49,7 +49,9 @@
/* All UBLK_F_* have to be included into UBLK_F_ALL */ /* All UBLK_F_* have to be included into UBLK_F_ALL */
#define UBLK_F_ALL (UBLK_F_SUPPORT_ZERO_COPY \ #define UBLK_F_ALL (UBLK_F_SUPPORT_ZERO_COPY \
| UBLK_F_URING_CMD_COMP_IN_TASK \ | UBLK_F_URING_CMD_COMP_IN_TASK \
| UBLK_F_NEED_GET_DATA) | UBLK_F_NEED_GET_DATA \
| UBLK_F_USER_RECOVERY \
| UBLK_F_USER_RECOVERY_REISSUE)
/* All UBLK_PARAM_TYPE_* should be included here */ /* All UBLK_PARAM_TYPE_* should be included here */
#define UBLK_PARAM_TYPE_ALL (UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DISCARD) #define UBLK_PARAM_TYPE_ALL (UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DISCARD)
@@ -119,7 +121,7 @@ struct ublk_queue {
unsigned long io_addr; /* mapped vm address */ unsigned long io_addr; /* mapped vm address */
unsigned int max_io_sz; unsigned int max_io_sz;
bool abort_work_pending; bool force_abort;
unsigned short nr_io_ready; /* how many ios setup */ unsigned short nr_io_ready; /* how many ios setup */
struct ublk_device *dev; struct ublk_device *dev;
struct ublk_io ios[0]; struct ublk_io ios[0];
@@ -161,6 +163,7 @@ struct ublk_device {
* monitor each queue's daemon periodically * monitor each queue's daemon periodically
*/ */
struct delayed_work monitor_work; struct delayed_work monitor_work;
struct work_struct quiesce_work;
struct work_struct stop_work; struct work_struct stop_work;
}; };
@@ -323,6 +326,30 @@ static inline int ublk_queue_cmd_buf_size(struct ublk_device *ub, int q_id)
PAGE_SIZE); PAGE_SIZE);
} }
static inline bool ublk_queue_can_use_recovery_reissue(
struct ublk_queue *ubq)
{
if ((ubq->flags & UBLK_F_USER_RECOVERY) &&
(ubq->flags & UBLK_F_USER_RECOVERY_REISSUE))
return true;
return false;
}
static inline bool ublk_queue_can_use_recovery(
struct ublk_queue *ubq)
{
if (ubq->flags & UBLK_F_USER_RECOVERY)
return true;
return false;
}
static inline bool ublk_can_use_recovery(struct ublk_device *ub)
{
if (ub->dev_info.flags & UBLK_F_USER_RECOVERY)
return true;
return false;
}
static void ublk_free_disk(struct gendisk *disk) static void ublk_free_disk(struct gendisk *disk)
{ {
struct ublk_device *ub = disk->private_data; struct ublk_device *ub = disk->private_data;
@@ -612,13 +639,17 @@ static void ublk_complete_rq(struct request *req)
* Also aborting may not be started yet, keep in mind that one failed * Also aborting may not be started yet, keep in mind that one failed
* request may be issued by block layer again. * request may be issued by block layer again.
*/ */
-static void __ublk_fail_req(struct ublk_io *io, struct request *req)
+static void __ublk_fail_req(struct ublk_queue *ubq, struct ublk_io *io,
+		struct request *req)
 {
 	WARN_ON_ONCE(io->flags & UBLK_IO_FLAG_ACTIVE);

 	if (!(io->flags & UBLK_IO_FLAG_ABORTED)) {
 		io->flags |= UBLK_IO_FLAG_ABORTED;
-		blk_mq_end_request(req, BLK_STS_IOERR);
+		if (ublk_queue_can_use_recovery_reissue(ubq))
+			blk_mq_requeue_request(req, false);
+		else
+			blk_mq_end_request(req, BLK_STS_IOERR);
 	}
 }
@@ -639,22 +670,40 @@ static void ubq_complete_io_cmd(struct ublk_io *io, int res)
#define UBLK_REQUEUE_DELAY_MS 3 #define UBLK_REQUEUE_DELAY_MS 3
static inline void __ublk_abort_rq(struct ublk_queue *ubq,
struct request *rq)
{
/* We cannot process this rq so just requeue it. */
if (ublk_queue_can_use_recovery(ubq))
blk_mq_requeue_request(rq, false);
else
blk_mq_end_request(rq, BLK_STS_IOERR);
mod_delayed_work(system_wq, &ubq->dev->monitor_work, 0);
}
 static inline void __ublk_rq_task_work(struct request *req)
 {
 	struct ublk_queue *ubq = req->mq_hctx->driver_data;
-	struct ublk_device *ub = ubq->dev;
 	int tag = req->tag;
 	struct ublk_io *io = &ubq->ios[tag];
-	bool task_exiting = current != ubq->ubq_daemon || ubq_daemon_is_dying(ubq);
 	unsigned int mapped_bytes;

 	pr_devel("%s: complete: op %d, qid %d tag %d io_flags %x addr %llx\n",
 			__func__, io->cmd->cmd_op, ubq->q_id, req->tag, io->flags,
 			ublk_get_iod(ubq, req->tag)->addr);

-	if (unlikely(task_exiting)) {
-		blk_mq_end_request(req, BLK_STS_IOERR);
-		mod_delayed_work(system_wq, &ub->monitor_work, 0);
+	/*
+	 * Task is exiting if either:
+	 *
+	 * (1) current != ubq_daemon.
+	 * io_uring_cmd_complete_in_task() tries to run task_work
+	 * in a workqueue if ubq_daemon(cmd's task) is PF_EXITING.
+	 *
+	 * (2) current->flags & PF_EXITING.
+	 */
+	if (unlikely(current != ubq->ubq_daemon || current->flags & PF_EXITING)) {
+		__ublk_abort_rq(ubq, req);
 		return;
 	}
@@ -739,13 +788,24 @@ static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx,
res = ublk_setup_iod(ubq, rq); res = ublk_setup_iod(ubq, rq);
if (unlikely(res != BLK_STS_OK)) if (unlikely(res != BLK_STS_OK))
return BLK_STS_IOERR; return BLK_STS_IOERR;
/* With recovery feature enabled, force_abort is set in
* ublk_stop_dev() before calling del_gendisk(). We have to
* abort all requeued and new rqs here to let del_gendisk()
* move on. Besides, we cannot call io_uring_cmd_complete_in_task()
* to avoid UAF on io_uring ctx.
*
* Note: force_abort is guaranteed to be seen because it is set
* before the request queue is unquiesced.
*/
if (ublk_queue_can_use_recovery(ubq) && unlikely(ubq->force_abort))
return BLK_STS_IOERR;
blk_mq_start_request(bd->rq); blk_mq_start_request(bd->rq);
if (unlikely(ubq_daemon_is_dying(ubq))) { if (unlikely(ubq_daemon_is_dying(ubq))) {
fail: fail:
mod_delayed_work(system_wq, &ubq->dev->monitor_work, 0); __ublk_abort_rq(ubq, rq);
return BLK_STS_IOERR; return BLK_STS_OK;
} }
if (ublk_can_use_task_work(ubq)) { if (ublk_can_use_task_work(ubq)) {
@@ -916,7 +976,7 @@ static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq)
*/ */
rq = blk_mq_tag_to_rq(ub->tag_set.tags[ubq->q_id], i); rq = blk_mq_tag_to_rq(ub->tag_set.tags[ubq->q_id], i);
if (rq) if (rq)
__ublk_fail_req(io, rq); __ublk_fail_req(ubq, io, rq);
} }
} }
ublk_put_device(ub); ublk_put_device(ub);
@@ -932,7 +992,10 @@ static void ublk_daemon_monitor_work(struct work_struct *work)
struct ublk_queue *ubq = ublk_get_queue(ub, i); struct ublk_queue *ubq = ublk_get_queue(ub, i);
if (ubq_daemon_is_dying(ubq)) { if (ubq_daemon_is_dying(ubq)) {
schedule_work(&ub->stop_work); if (ublk_queue_can_use_recovery(ubq))
schedule_work(&ub->quiesce_work);
else
schedule_work(&ub->stop_work);
/* abort queue is for making forward progress */ /* abort queue is for making forward progress */
ublk_abort_queue(ub, ubq); ublk_abort_queue(ub, ubq);
@@ -940,12 +1003,13 @@ static void ublk_daemon_monitor_work(struct work_struct *work)
} }
 	/*
-	 * We can't schedule monitor work after ublk_remove() is started.
+	 * We can't schedule monitor work once ub's state is no longer
+	 * UBLK_S_DEV_LIVE, i.e. after ublk_remove() or __ublk_quiesce_dev()
+	 * is started.
 	 *
 	 * No need ub->mutex, monitor work are canceled after state is marked
-	 * as DEAD, so DEAD state is observed reliably.
+	 * as not LIVE, so the new state is observed reliably.
 	 */
-	if (ub->dev_info.state != UBLK_S_DEV_DEAD)
+	if (ub->dev_info.state == UBLK_S_DEV_LIVE)
 		schedule_delayed_work(&ub->monitor_work,
 				      UBLK_DAEMON_MONITOR_PERIOD);
} }
@@ -982,12 +1046,97 @@ static void ublk_cancel_dev(struct ublk_device *ub)
ublk_cancel_queue(ublk_get_queue(ub, i)); ublk_cancel_queue(ublk_get_queue(ub, i));
} }
static void ublk_stop_dev(struct ublk_device *ub) static bool ublk_check_inflight_rq(struct request *rq, void *data)
{ {
bool *idle = data;
if (blk_mq_request_started(rq)) {
*idle = false;
return false;
}
return true;
}
static void ublk_wait_tagset_rqs_idle(struct ublk_device *ub)
{
bool idle;
WARN_ON_ONCE(!blk_queue_quiesced(ub->ub_disk->queue));
while (true) {
idle = true;
blk_mq_tagset_busy_iter(&ub->tag_set,
ublk_check_inflight_rq, &idle);
if (idle)
break;
msleep(UBLK_REQUEUE_DELAY_MS);
}
}
static void __ublk_quiesce_dev(struct ublk_device *ub)
{
pr_devel("%s: quiesce ub: dev_id %d state %s\n",
__func__, ub->dev_info.dev_id,
ub->dev_info.state == UBLK_S_DEV_LIVE ?
"LIVE" : "QUIESCED");
blk_mq_quiesce_queue(ub->ub_disk->queue);
ublk_wait_tagset_rqs_idle(ub);
ub->dev_info.state = UBLK_S_DEV_QUIESCED;
ublk_cancel_dev(ub);
/* We are going to release the task_struct of ubq_daemon and reset
 * ->ubq_daemon to NULL. So in monitor_work, a check on ubq_daemon would
 * cause a UAF. Besides, monitor_work is not necessary in QUIESCED state
 * since we have already scheduled quiesce_work and quiesced all ubqs.
 *
 * Do not let monitor_work schedule itself if the state is QUIESCED. And we cancel
* it here and re-schedule it in END_USER_RECOVERY to avoid UAF.
*/
cancel_delayed_work_sync(&ub->monitor_work);
}
static void ublk_quiesce_work_fn(struct work_struct *work)
{
struct ublk_device *ub =
container_of(work, struct ublk_device, quiesce_work);
mutex_lock(&ub->mutex); mutex_lock(&ub->mutex);
if (ub->dev_info.state != UBLK_S_DEV_LIVE) if (ub->dev_info.state != UBLK_S_DEV_LIVE)
goto unlock; goto unlock;
__ublk_quiesce_dev(ub);
unlock:
mutex_unlock(&ub->mutex);
}
static void ublk_unquiesce_dev(struct ublk_device *ub)
{
int i;
pr_devel("%s: unquiesce ub: dev_id %d state %s\n",
__func__, ub->dev_info.dev_id,
ub->dev_info.state == UBLK_S_DEV_LIVE ?
"LIVE" : "QUIESCED");
/* quiesce_work has run. We let requeued rqs be aborted
* before running fallback_wq. "force_abort" must be seen
* after the request queue is unquiesced. Then del_gendisk()
* can move on.
*/
for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
ublk_get_queue(ub, i)->force_abort = true;
blk_mq_unquiesce_queue(ub->ub_disk->queue);
/* We may have requeued some rqs in ublk_quiesce_queue() */
blk_mq_kick_requeue_list(ub->ub_disk->queue);
}
static void ublk_stop_dev(struct ublk_device *ub)
{
mutex_lock(&ub->mutex);
if (ub->dev_info.state == UBLK_S_DEV_DEAD)
goto unlock;
if (ublk_can_use_recovery(ub)) {
if (ub->dev_info.state == UBLK_S_DEV_LIVE)
__ublk_quiesce_dev(ub);
ublk_unquiesce_dev(ub);
}
del_gendisk(ub->ub_disk); del_gendisk(ub->ub_disk);
ub->dev_info.state = UBLK_S_DEV_DEAD; ub->dev_info.state = UBLK_S_DEV_DEAD;
ub->dev_info.ublksrv_pid = -1; ub->dev_info.ublksrv_pid = -1;
@@ -1311,6 +1460,7 @@ static void ublk_remove(struct ublk_device *ub)
{ {
ublk_stop_dev(ub); ublk_stop_dev(ub);
cancel_work_sync(&ub->stop_work); cancel_work_sync(&ub->stop_work);
cancel_work_sync(&ub->quiesce_work);
cdev_device_del(&ub->cdev, &ub->cdev_dev); cdev_device_del(&ub->cdev, &ub->cdev_dev);
put_device(&ub->cdev_dev); put_device(&ub->cdev_dev);
} }
@@ -1487,6 +1637,7 @@ static int ublk_ctrl_add_dev(struct io_uring_cmd *cmd)
goto out_unlock; goto out_unlock;
mutex_init(&ub->mutex); mutex_init(&ub->mutex);
spin_lock_init(&ub->mm_lock); spin_lock_init(&ub->mm_lock);
INIT_WORK(&ub->quiesce_work, ublk_quiesce_work_fn);
INIT_WORK(&ub->stop_work, ublk_stop_work_fn); INIT_WORK(&ub->stop_work, ublk_stop_work_fn);
INIT_DELAYED_WORK(&ub->monitor_work, ublk_daemon_monitor_work); INIT_DELAYED_WORK(&ub->monitor_work, ublk_daemon_monitor_work);
@@ -1607,6 +1758,7 @@ static int ublk_ctrl_stop_dev(struct io_uring_cmd *cmd)
ublk_stop_dev(ub); ublk_stop_dev(ub);
cancel_work_sync(&ub->stop_work); cancel_work_sync(&ub->stop_work);
cancel_work_sync(&ub->quiesce_work);
ublk_put_device(ub); ublk_put_device(ub);
return 0; return 0;
@@ -1709,6 +1861,116 @@ static int ublk_ctrl_set_params(struct io_uring_cmd *cmd)
return ret; return ret;
} }
static void ublk_queue_reinit(struct ublk_device *ub, struct ublk_queue *ubq)
{
int i;
WARN_ON_ONCE(!(ubq->ubq_daemon && ubq_daemon_is_dying(ubq)));
/* All old ioucmds have to be completed */
WARN_ON_ONCE(ubq->nr_io_ready);
/* old daemon is PF_EXITING, put it now */
put_task_struct(ubq->ubq_daemon);
/* We have to reset it to NULL, otherwise ub won't accept new FETCH_REQ */
ubq->ubq_daemon = NULL;
for (i = 0; i < ubq->q_depth; i++) {
struct ublk_io *io = &ubq->ios[i];
/* forget everything now and be ready for new FETCH_REQ */
io->flags = 0;
io->cmd = NULL;
io->addr = 0;
}
}
static int ublk_ctrl_start_recovery(struct io_uring_cmd *cmd)
{
struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd;
struct ublk_device *ub;
int ret = -EINVAL;
int i;
ub = ublk_get_device_from_id(header->dev_id);
if (!ub)
return ret;
mutex_lock(&ub->mutex);
if (!ublk_can_use_recovery(ub))
goto out_unlock;
/*
* START_RECOVERY is only allowed after:
*
* (1) UB_STATE_OPEN is not set, which means the dying process has exited
* and the related io_uring ctx is freed, so the file struct of /dev/ublkcX
* is released.
*
* (2) UBLK_S_DEV_QUIESCED is set, which means the quiesce_work:
*     (a) has quiesced the request queue
*     (b) has requeued every inflight rq whose io_flags is ACTIVE
*     (c) has requeued/aborted every inflight rq whose io_flags is NOT ACTIVE
*     (d) has completed/canceled all ioucmds owned by the dying process
*/
if (test_bit(UB_STATE_OPEN, &ub->state) ||
ub->dev_info.state != UBLK_S_DEV_QUIESCED) {
ret = -EBUSY;
goto out_unlock;
}
pr_devel("%s: start recovery for dev id %d.\n", __func__, header->dev_id);
for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
ublk_queue_reinit(ub, ublk_get_queue(ub, i));
/* set to NULL, otherwise new ubq_daemon cannot mmap the io_cmd_buf */
ub->mm = NULL;
ub->nr_queues_ready = 0;
init_completion(&ub->completion);
ret = 0;
out_unlock:
mutex_unlock(&ub->mutex);
ublk_put_device(ub);
return ret;
}
static int ublk_ctrl_end_recovery(struct io_uring_cmd *cmd)
{
struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd;
int ublksrv_pid = (int)header->data[0];
struct ublk_device *ub;
int ret = -EINVAL;
ub = ublk_get_device_from_id(header->dev_id);
if (!ub)
return ret;
pr_devel("%s: Waiting for new ubq_daemons(nr: %d) are ready, dev id %d...\n",
__func__, ub->dev_info.nr_hw_queues, header->dev_id);
/* wait until new ubq_daemon sending all FETCH_REQ */
wait_for_completion_interruptible(&ub->completion);
pr_devel("%s: All new ubq_daemons(nr: %d) are ready, dev id %d\n",
__func__, ub->dev_info.nr_hw_queues, header->dev_id);
mutex_lock(&ub->mutex);
if (!ublk_can_use_recovery(ub))
goto out_unlock;
if (ub->dev_info.state != UBLK_S_DEV_QUIESCED) {
ret = -EBUSY;
goto out_unlock;
}
ub->dev_info.ublksrv_pid = ublksrv_pid;
pr_devel("%s: new ublksrv_pid %d, dev id %d\n",
__func__, ublksrv_pid, header->dev_id);
blk_mq_unquiesce_queue(ub->ub_disk->queue);
pr_devel("%s: queue unquiesced, dev id %d.\n",
__func__, header->dev_id);
blk_mq_kick_requeue_list(ub->ub_disk->queue);
ub->dev_info.state = UBLK_S_DEV_LIVE;
schedule_delayed_work(&ub->monitor_work, UBLK_DAEMON_MONITOR_PERIOD);
ret = 0;
out_unlock:
mutex_unlock(&ub->mutex);
ublk_put_device(ub);
return ret;
}
static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd, static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd,
unsigned int issue_flags) unsigned int issue_flags)
{ {
@@ -1750,6 +2012,12 @@ static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd,
case UBLK_CMD_SET_PARAMS: case UBLK_CMD_SET_PARAMS:
ret = ublk_ctrl_set_params(cmd); ret = ublk_ctrl_set_params(cmd);
break; break;
case UBLK_CMD_START_USER_RECOVERY:
ret = ublk_ctrl_start_recovery(cmd);
break;
case UBLK_CMD_END_USER_RECOVERY:
ret = ublk_ctrl_end_recovery(cmd);
break;
default: default:
break; break;
} }
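
Taken together, the recovery changes above boil down to two decision points: requests that were never handed to the dying daemon are safe to requeue whenever UBLK_F_USER_RECOVERY is set, while requests the daemon may have partially executed are requeued only when UBLK_F_USER_RECOVERY_REISSUE is also set. A hedged restatement of the __ublk_abort_rq()/__ublk_fail_req() behaviour as one illustrative helper (not driver code):

static void example_fail_or_requeue(struct ublk_queue *ubq, struct request *rq,
				    bool delivered_to_daemon)
{
	if (!(ubq->flags & UBLK_F_USER_RECOVERY)) {
		/* no recovery support: fail the request immediately */
		blk_mq_end_request(rq, BLK_STS_IOERR);
		return;
	}

	if (!delivered_to_daemon || (ubq->flags & UBLK_F_USER_RECOVERY_REISSUE))
		/* safe (or explicitly allowed) to replay under a new daemon */
		blk_mq_requeue_request(rq, false);
	else
		/* the old daemon may have partially executed it: fail it */
		blk_mq_end_request(rq, BLK_STS_IOERR);
}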


@@ -801,7 +801,7 @@ static const struct attribute_group *virtblk_attr_groups[] = {
NULL, NULL,
}; };
static int virtblk_map_queues(struct blk_mq_tag_set *set) static void virtblk_map_queues(struct blk_mq_tag_set *set)
{ {
struct virtio_blk *vblk = set->driver_data; struct virtio_blk *vblk = set->driver_data;
int i, qoff; int i, qoff;
@@ -826,8 +826,6 @@ static int virtblk_map_queues(struct blk_mq_tag_set *set)
else else
blk_mq_virtio_map_queues(&set->map[i], vblk->vdev, 0); blk_mq_virtio_map_queues(&set->map[i], vblk->vdev, 0);
} }
return 0;
} }
static void virtblk_complete_batch(struct io_comp_batch *iob) static void virtblk_complete_batch(struct io_comp_batch *iob)


@@ -499,7 +499,7 @@ static ssize_t backing_dev_store(struct device *dev,
goto out; goto out;
} }
strlcpy(file_name, buf, PATH_MAX); strscpy(file_name, buf, PATH_MAX);
/* ignore trailing newline */ /* ignore trailing newline */
sz = strlen(file_name); sz = strlen(file_name);
if (sz > 0 && file_name[sz - 1] == '\n') if (sz > 0 && file_name[sz - 1] == '\n')
@@ -1031,7 +1031,7 @@ static ssize_t comp_algorithm_store(struct device *dev,
char compressor[ARRAY_SIZE(zram->compressor)]; char compressor[ARRAY_SIZE(zram->compressor)];
size_t sz; size_t sz;
strlcpy(compressor, buf, sizeof(compressor)); strscpy(compressor, buf, sizeof(compressor));
/* ignore trailing newline */ /* ignore trailing newline */
sz = strlen(compressor); sz = strlen(compressor);
if (sz > 0 && compressor[sz - 1] == '\n') if (sz > 0 && compressor[sz - 1] == '\n')
@@ -1974,7 +1974,7 @@ static int zram_add(void)
if (ret) if (ret)
goto out_cleanup_disk; goto out_cleanup_disk;
strlcpy(zram->compressor, default_compressor, sizeof(zram->compressor)); strscpy(zram->compressor, default_compressor, sizeof(zram->compressor));
zram_debugfs_register(zram); zram_debugfs_register(zram);
pr_info("Added device: %s\n", zram->disk->disk_name); pr_info("Added device: %s\n", zram->disk->disk_name);


@@ -107,7 +107,7 @@
* *
* BTREE NODES: * BTREE NODES:
* *
* Our unit of allocation is a bucket, and we we can't arbitrarily allocate and * Our unit of allocation is a bucket, and we can't arbitrarily allocate and
* free smaller than a bucket - so, that's how big our btree nodes are. * free smaller than a bucket - so, that's how big our btree nodes are.
* *
* (If buckets are really big we'll only use part of the bucket for a btree node * (If buckets are really big we'll only use part of the bucket for a btree node


@@ -1264,7 +1264,7 @@ static void __btree_sort(struct btree_keys *b, struct btree_iter *iter,
* *
* Don't worry event 'out' is allocated from mempool, it can * Don't worry event 'out' is allocated from mempool, it can
* still be swapped here. Because state->pool is a page mempool * still be swapped here. Because state->pool is a page mempool
* creaated by by mempool_init_page_pool(), which allocates * created by mempool_init_page_pool(), which allocates
* pages by alloc_pages() indeed. * pages by alloc_pages() indeed.
*/ */


@@ -54,7 +54,6 @@ void bch_cache_accounting_destroy(struct cache_accounting *acc);
void bch_mark_cache_accounting(struct cache_set *c, struct bcache_device *d, void bch_mark_cache_accounting(struct cache_set *c, struct bcache_device *d,
bool hit, bool bypass); bool hit, bool bypass);
void bch_mark_cache_readahead(struct cache_set *c, struct bcache_device *d);
void bch_mark_cache_miss_collision(struct cache_set *c, void bch_mark_cache_miss_collision(struct cache_set *c,
struct bcache_device *d); struct bcache_device *d);
void bch_mark_sectors_bypassed(struct cache_set *c, void bch_mark_sectors_bypassed(struct cache_set *c,


@@ -157,6 +157,53 @@ static void __update_writeback_rate(struct cached_dev *dc)
dc->writeback_rate_target = target; dc->writeback_rate_target = target;
} }
static bool idle_counter_exceeded(struct cache_set *c)
{
int counter, dev_nr;
/*
* If c->idle_counter overflows (idle for a really long time), reset
* it to 0 and do not set the maximum rate this time, for code
* simplicity.
*/
counter = atomic_inc_return(&c->idle_counter);
if (counter <= 0) {
atomic_set(&c->idle_counter, 0);
return false;
}
dev_nr = atomic_read(&c->attached_dev_nr);
if (dev_nr == 0)
return false;
/*
* c->idle_counter is increased by the writeback threads of all
* attached backing devices. In order to represent a rough time
* period, the counter should be divided by dev_nr; otherwise the
* idle time cannot grow larger as more backing devices are
* attached. The following calculation is equivalent to checking
* (counter / dev_nr) < (dev_nr * 6)
*/
if (counter < (dev_nr * dev_nr * 6))
return false;
return true;
}
/*
* Idle_counter is increased every time update_writeback_rate() is
* called. If all backing devices attached to the same cache set have
* identical dc->writeback_rate_update_seconds values, it takes about 6
* rounds of update_writeback_rate() on each backing device before
* c->at_max_writeback_rate is set to 1, and then the max writeback rate
* is set for each dc->writeback_rate.rate.
* In order to avoid extra locking cost for counting the exact number of
* dirty cached devices, c->attached_dev_nr is used to calculate the idle
* threshold. It might be bigger if not all cached devices are in write-
* back mode, but it still works well with limited extra rounds of
* update_writeback_rate().
*/
static bool set_at_max_writeback_rate(struct cache_set *c, static bool set_at_max_writeback_rate(struct cache_set *c,
struct cached_dev *dc) struct cached_dev *dc)
{ {
@@ -167,21 +214,8 @@ static bool set_at_max_writeback_rate(struct cache_set *c,
/* Don't set max writeback rate if gc is running */ /* Don't set max writeback rate if gc is running */
if (!c->gc_mark_valid) if (!c->gc_mark_valid)
return false; return false;
-	/*
-	 * Idle_counter is increased everytime when update_writeback_rate() is
-	 * called. If all backing devices attached to the same cache set have
-	 * identical dc->writeback_rate_update_seconds values, it is about 6
-	 * rounds of update_writeback_rate() on each backing device before
-	 * c->at_max_writeback_rate is set to 1, and then max wrteback rate set
-	 * to each dc->writeback_rate.rate.
-	 * In order to avoid extra locking cost for counting exact dirty cached
-	 * devices number, c->attached_dev_nr is used to calculate the idle
-	 * throushold. It might be bigger if not all cached device are in write-
-	 * back mode, but it still works well with limited extra rounds of
-	 * update_writeback_rate().
-	 */
-	if (atomic_inc_return(&c->idle_counter) <
-	    atomic_read(&c->attached_dev_nr) * 6)
+	if (!idle_counter_exceeded(c))
 		return false;
if (atomic_read(&c->at_max_writeback_rate) != 1) if (atomic_read(&c->at_max_writeback_rate) != 1)
@@ -195,13 +229,10 @@ static bool set_at_max_writeback_rate(struct cache_set *c,
dc->writeback_rate_change = 0; dc->writeback_rate_change = 0;
 	/*
-	 * Check c->idle_counter and c->at_max_writeback_rate agagain in case
-	 * new I/O arrives during before set_at_max_writeback_rate() returns.
-	 * Then the writeback rate is set to 1, and its new value should be
-	 * decided via __update_writeback_rate().
+	 * Check again in case new I/O arrives before
+	 * set_at_max_writeback_rate() returns.
 	 */
-	if ((atomic_read(&c->idle_counter) <
-	     atomic_read(&c->attached_dev_nr) * 6) ||
+	if (!idle_counter_exceeded(c) ||
 	    !atomic_read(&c->at_max_writeback_rate))
return false; return false;
@@ -801,10 +832,9 @@ static int bch_writeback_thread(void *arg)
} }
} }
if (dc->writeback_write_wq) { if (dc->writeback_write_wq)
flush_workqueue(dc->writeback_write_wq);
destroy_workqueue(dc->writeback_write_wq); destroy_workqueue(dc->writeback_write_wq);
}
cached_dev_put(dc); cached_dev_put(dc);
wait_for_kthread_stop(); wait_for_kthread_stop();
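
For a concrete feel of the new threshold in idle_counter_exceeded(): the counter is bumped by every attached device's rate update, and the test counter < dev_nr * dev_nr * 6 is the same as (counter / dev_nr) < dev_nr * 6, so with three attached backing devices the maximum rate only kicks in once the shared counter reaches 3 * 3 * 6 = 54 increments. A minimal restatement of that check (illustrative, not bcache code):

static bool example_idle_threshold_reached(int counter, int dev_nr)
{
	/* equivalent to (counter / dev_nr) >= dev_nr * 6, without the division */
	return dev_nr > 0 && counter >= dev_nr * dev_nr * 6;
}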


@@ -1856,9 +1856,7 @@ static bool dm_table_supports_write_zeroes(struct dm_table *t)
static int device_not_nowait_capable(struct dm_target *ti, struct dm_dev *dev, static int device_not_nowait_capable(struct dm_target *ti, struct dm_dev *dev,
sector_t start, sector_t len, void *data) sector_t start, sector_t len, void *data)
{ {
struct request_queue *q = bdev_get_queue(dev->bdev); return !bdev_nowait(dev->bdev);
return !blk_queue_nowait(q);
} }
static bool dm_table_supports_nowait(struct dm_table *t) static bool dm_table_supports_nowait(struct dm_table *t)
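
device_not_nowait_capable() now uses bdev_nowait(), the helper introduced by the "block: replace blk_queue_nowait with bdev_nowait" patch in this same series. Conceptually it is just the old two-step test folded into one call, roughly along these lines (see include/linux/blkdev.h for the real definition):

static inline bool example_bdev_nowait(struct block_device *bdev)
{
	/* does the underlying queue advertise REQ_NOWAIT support? */
	return test_bit(QUEUE_FLAG_NOWAIT, &bdev_get_queue(bdev)->queue_flags);
}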


@@ -5845,7 +5845,7 @@ int md_run(struct mddev *mddev)
} }
} }
sysfs_notify_dirent_safe(rdev->sysfs_state); sysfs_notify_dirent_safe(rdev->sysfs_state);
nowait = nowait && blk_queue_nowait(bdev_get_queue(rdev->bdev)); nowait = nowait && bdev_nowait(rdev->bdev);
} }
if (!bioset_initialized(&mddev->bio_set)) { if (!bioset_initialized(&mddev->bio_set)) {
@@ -6982,7 +6982,7 @@ static int hot_add_disk(struct mddev *mddev, dev_t dev)
* If the new disk does not support REQ_NOWAIT, * If the new disk does not support REQ_NOWAIT,
* disable on the whole MD. * disable on the whole MD.
*/ */
if (!blk_queue_nowait(bdev_get_queue(rdev->bdev))) { if (!bdev_nowait(rdev->bdev)) {
pr_info("%s: Disabling nowait because %pg does not support nowait\n", pr_info("%s: Disabling nowait because %pg does not support nowait\n",
mdname(mddev), rdev->bdev); mdname(mddev), rdev->bdev);
blk_queue_flag_clear(QUEUE_FLAG_NOWAIT, mddev->queue); blk_queue_flag_clear(QUEUE_FLAG_NOWAIT, mddev->queue);
@@ -8156,7 +8156,6 @@ static void *md_seq_start(struct seq_file *seq, loff_t *pos)
list_for_each(tmp,&all_mddevs) list_for_each(tmp,&all_mddevs)
if (!l--) { if (!l--) {
mddev = list_entry(tmp, struct mddev, all_mddevs); mddev = list_entry(tmp, struct mddev, all_mddevs);
mddev_get(mddev);
if (!mddev_get(mddev)) if (!mddev_get(mddev))
continue; continue;
spin_unlock(&all_mddevs_lock); spin_unlock(&all_mddevs_lock);


@@ -47,7 +47,7 @@ static void dump_zones(struct mddev *mddev)
int len = 0; int len = 0;
for (k = 0; k < conf->strip_zone[j].nb_dev; k++) for (k = 0; k < conf->strip_zone[j].nb_dev; k++)
len += snprintf(line+len, 200-len, "%s%pg", k?"/":"", len += scnprintf(line+len, 200-len, "%s%pg", k?"/":"",
conf->devlist[j * raid_disks + k]->bdev); conf->devlist[j * raid_disks + k]->bdev);
pr_debug("md: zone%d=[%s]\n", j, line); pr_debug("md: zone%d=[%s]\n", j, line);


@@ -79,6 +79,21 @@ static void end_reshape(struct r10conf *conf);
#include "raid1-10.c" #include "raid1-10.c"
#define NULL_CMD
#define cmd_before(conf, cmd) \
do { \
write_sequnlock_irq(&(conf)->resync_lock); \
cmd; \
} while (0)
#define cmd_after(conf) write_seqlock_irq(&(conf)->resync_lock)
#define wait_event_barrier_cmd(conf, cond, cmd) \
wait_event_cmd((conf)->wait_barrier, cond, cmd_before(conf, cmd), \
cmd_after(conf))
#define wait_event_barrier(conf, cond) \
wait_event_barrier_cmd(conf, cond, NULL_CMD)
/* /*
* for resync bio, r10bio pointer can be retrieved from the per-bio * for resync bio, r10bio pointer can be retrieved from the per-bio
* 'struct resync_pages'. * 'struct resync_pages'.
@@ -274,6 +289,12 @@ static void put_buf(struct r10bio *r10_bio)
lower_barrier(conf); lower_barrier(conf);
} }
static void wake_up_barrier(struct r10conf *conf)
{
if (wq_has_sleeper(&conf->wait_barrier))
wake_up(&conf->wait_barrier);
}
static void reschedule_retry(struct r10bio *r10_bio) static void reschedule_retry(struct r10bio *r10_bio)
{ {
unsigned long flags; unsigned long flags;
@@ -930,78 +951,101 @@ static void flush_pending_writes(struct r10conf *conf)
 static void raise_barrier(struct r10conf *conf, int force)
 {
+	write_seqlock_irq(&conf->resync_lock);
+
 	BUG_ON(force && !conf->barrier);
-	spin_lock_irq(&conf->resync_lock);

 	/* Wait until no block IO is waiting (unless 'force') */
-	wait_event_lock_irq(conf->wait_barrier, force || !conf->nr_waiting,
-			    conf->resync_lock);
+	wait_event_barrier(conf, force || !conf->nr_waiting);

 	/* block any new IO from starting */
-	conf->barrier++;
+	WRITE_ONCE(conf->barrier, conf->barrier + 1);

 	/* Now wait for all pending IO to complete */
-	wait_event_lock_irq(conf->wait_barrier,
-			    !atomic_read(&conf->nr_pending) && conf->barrier < RESYNC_DEPTH,
-			    conf->resync_lock);
+	wait_event_barrier(conf, !atomic_read(&conf->nr_pending) &&
+				 conf->barrier < RESYNC_DEPTH);

-	spin_unlock_irq(&conf->resync_lock);
+	write_sequnlock_irq(&conf->resync_lock);
 }

 static void lower_barrier(struct r10conf *conf)
 {
 	unsigned long flags;
-	spin_lock_irqsave(&conf->resync_lock, flags);
-	conf->barrier--;
-	spin_unlock_irqrestore(&conf->resync_lock, flags);
+
+	write_seqlock_irqsave(&conf->resync_lock, flags);
+	WRITE_ONCE(conf->barrier, conf->barrier - 1);
+	write_sequnlock_irqrestore(&conf->resync_lock, flags);
 	wake_up(&conf->wait_barrier);
 }
static bool stop_waiting_barrier(struct r10conf *conf)
{
struct bio_list *bio_list = current->bio_list;
/* barrier is dropped */
if (!conf->barrier)
return true;
/*
* If there are already pending requests (preventing the barrier from
* rising completely), and the pre-process bio queue isn't empty, then
* don't wait, as we need to empty that queue to get the nr_pending
* count down.
*/
if (atomic_read(&conf->nr_pending) && bio_list &&
(!bio_list_empty(&bio_list[0]) || !bio_list_empty(&bio_list[1])))
return true;
/* move on if recovery thread is blocked by us */
if (conf->mddev->thread->tsk == current &&
test_bit(MD_RECOVERY_RUNNING, &conf->mddev->recovery) &&
conf->nr_queued > 0)
return true;
return false;
}
static bool wait_barrier_nolock(struct r10conf *conf)
{
unsigned int seq = read_seqbegin(&conf->resync_lock);
if (READ_ONCE(conf->barrier))
return false;
atomic_inc(&conf->nr_pending);
if (!read_seqretry(&conf->resync_lock, seq))
return true;
if (atomic_dec_and_test(&conf->nr_pending))
wake_up_barrier(conf);
return false;
}
 static bool wait_barrier(struct r10conf *conf, bool nowait)
 {
 	bool ret = true;

-	spin_lock_irq(&conf->resync_lock);
+	if (wait_barrier_nolock(conf))
+		return true;
+
+	write_seqlock_irq(&conf->resync_lock);
 	if (conf->barrier) {
-		struct bio_list *bio_list = current->bio_list;
-		conf->nr_waiting++;
-		/* Wait for the barrier to drop.
-		 * However if there are already pending
-		 * requests (preventing the barrier from
-		 * rising completely), and the
-		 * pre-process bio queue isn't empty,
-		 * then don't wait, as we need to empty
-		 * that queue to get the nr_pending
-		 * count down.
-		 */
 		/* Return false when nowait flag is set */
 		if (nowait) {
 			ret = false;
 		} else {
+			conf->nr_waiting++;
 			raid10_log(conf->mddev, "wait barrier");
-			wait_event_lock_irq(conf->wait_barrier,
-					    !conf->barrier ||
-					    (atomic_read(&conf->nr_pending) &&
-					     bio_list &&
-					     (!bio_list_empty(&bio_list[0]) ||
-					      !bio_list_empty(&bio_list[1]))) ||
-					    /* move on if recovery thread is
-					     * blocked by us
-					     */
-					    (conf->mddev->thread->tsk == current &&
-					     test_bit(MD_RECOVERY_RUNNING,
-						      &conf->mddev->recovery) &&
-					     conf->nr_queued > 0),
-					    conf->resync_lock);
+			wait_event_barrier(conf, stop_waiting_barrier(conf));
+			conf->nr_waiting--;
 		}
-		conf->nr_waiting--;
 		if (!conf->nr_waiting)
 			wake_up(&conf->wait_barrier);
 	}
 	/* Only increment nr_pending when we wait */
 	if (ret)
 		atomic_inc(&conf->nr_pending);
-	spin_unlock_irq(&conf->resync_lock);
+	write_sequnlock_irq(&conf->resync_lock);
 	return ret;
 }
@@ -1009,7 +1053,7 @@ static void allow_barrier(struct r10conf *conf)
{ {
if ((atomic_dec_and_test(&conf->nr_pending)) || if ((atomic_dec_and_test(&conf->nr_pending)) ||
(conf->array_freeze_pending)) (conf->array_freeze_pending))
wake_up(&conf->wait_barrier); wake_up_barrier(conf);
} }
static void freeze_array(struct r10conf *conf, int extra) static void freeze_array(struct r10conf *conf, int extra)
@@ -1026,27 +1070,24 @@ static void freeze_array(struct r10conf *conf, int extra)
* must match the number of pending IOs (nr_pending) before * must match the number of pending IOs (nr_pending) before
* we continue. * we continue.
*/ */
spin_lock_irq(&conf->resync_lock); write_seqlock_irq(&conf->resync_lock);
conf->array_freeze_pending++; conf->array_freeze_pending++;
conf->barrier++; WRITE_ONCE(conf->barrier, conf->barrier + 1);
conf->nr_waiting++; conf->nr_waiting++;
wait_event_lock_irq_cmd(conf->wait_barrier, wait_event_barrier_cmd(conf, atomic_read(&conf->nr_pending) ==
atomic_read(&conf->nr_pending) == conf->nr_queued+extra, conf->nr_queued + extra, flush_pending_writes(conf));
conf->resync_lock,
flush_pending_writes(conf));
conf->array_freeze_pending--; conf->array_freeze_pending--;
spin_unlock_irq(&conf->resync_lock); write_sequnlock_irq(&conf->resync_lock);
} }
static void unfreeze_array(struct r10conf *conf) static void unfreeze_array(struct r10conf *conf)
{ {
/* reverse the effect of the freeze */ /* reverse the effect of the freeze */
spin_lock_irq(&conf->resync_lock); write_seqlock_irq(&conf->resync_lock);
conf->barrier--; WRITE_ONCE(conf->barrier, conf->barrier - 1);
conf->nr_waiting--; conf->nr_waiting--;
wake_up(&conf->wait_barrier); wake_up(&conf->wait_barrier);
spin_unlock_irq(&conf->resync_lock); write_sequnlock_irq(&conf->resync_lock);
} }
static sector_t choose_data_offset(struct r10bio *r10_bio, static sector_t choose_data_offset(struct r10bio *r10_bio,
@@ -1885,7 +1926,7 @@ static bool raid10_make_request(struct mddev *mddev, struct bio *bio)
__make_request(mddev, bio, sectors); __make_request(mddev, bio, sectors);
/* In case raid10d snuck in to freeze_array */ /* In case raid10d snuck in to freeze_array */
wake_up(&conf->wait_barrier); wake_up_barrier(conf);
return true; return true;
} }
@@ -1980,7 +2021,7 @@ static int enough(struct r10conf *conf, int ignore)
* Otherwise, it must be degraded: * Otherwise, it must be degraded:
* - recovery is interrupted. * - recovery is interrupted.
* - &mddev->degraded is bumped. * - &mddev->degraded is bumped.
*
* @rdev is marked as &Faulty excluding case when array is failed and * @rdev is marked as &Faulty excluding case when array is failed and
* &mddev->fail_last_dev is off. * &mddev->fail_last_dev is off.
*/ */
@@ -4032,7 +4073,7 @@ static struct r10conf *setup_conf(struct mddev *mddev)
INIT_LIST_HEAD(&conf->retry_list); INIT_LIST_HEAD(&conf->retry_list);
INIT_LIST_HEAD(&conf->bio_end_io_list); INIT_LIST_HEAD(&conf->bio_end_io_list);
spin_lock_init(&conf->resync_lock); seqlock_init(&conf->resync_lock);
init_waitqueue_head(&conf->wait_barrier); init_waitqueue_head(&conf->wait_barrier);
atomic_set(&conf->nr_pending, 0); atomic_set(&conf->nr_pending, 0);
@@ -4351,7 +4392,7 @@ static void *raid10_takeover_raid0(struct mddev *mddev, sector_t size, int devs)
rdev->new_raid_disk = rdev->raid_disk * 2; rdev->new_raid_disk = rdev->raid_disk * 2;
rdev->sectors = size; rdev->sectors = size;
} }
conf->barrier = 1; WRITE_ONCE(conf->barrier, 1);
} }
return conf; return conf;
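
The performance win in the raid10 hunks above comes from the lockless fast path: readers sample the seqlock, bump nr_pending optimistically, and only fall back to the write-locked slow path if a barrier writer raced with them. A condensed restatement of wait_barrier_nolock() (not new logic, just the pattern spelled out):

static bool example_enter_io_fast_path(struct r10conf *conf)
{
	unsigned int seq = read_seqbegin(&conf->resync_lock);

	if (READ_ONCE(conf->barrier))
		return false;			/* barrier raised: take the slow path */

	atomic_inc(&conf->nr_pending);		/* optimistically claim entry */
	if (!read_seqretry(&conf->resync_lock, seq))
		return true;			/* no writer raced with us */

	/* a writer touched the barrier meanwhile: undo and fall back */
	if (atomic_dec_and_test(&conf->nr_pending))
		wake_up(&conf->wait_barrier);
	return false;
}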


@@ -76,7 +76,7 @@ struct r10conf {
/* queue pending writes and submit them on unplug */ /* queue pending writes and submit them on unplug */
struct bio_list pending_bio_list; struct bio_list pending_bio_list;
spinlock_t resync_lock; seqlock_t resync_lock;
atomic_t nr_pending; atomic_t nr_pending;
int nr_waiting; int nr_waiting;
int nr_queued; int nr_queued;


@@ -125,7 +125,7 @@ struct r5l_log {
* reclaimed. if it's 0, reclaim spaces * reclaimed. if it's 0, reclaim spaces
* used by io_units which are in * used by io_units which are in
* IO_UNIT_STRIPE_END state (eg, reclaim * IO_UNIT_STRIPE_END state (eg, reclaim
* dones't wait for specific io_unit * doesn't wait for specific io_unit
* switching to IO_UNIT_STRIPE_END * switching to IO_UNIT_STRIPE_END
* state) */ * state) */
wait_queue_head_t iounit_wait; wait_queue_head_t iounit_wait;
@@ -1327,9 +1327,9 @@ static void r5l_write_super_and_discard_space(struct r5l_log *log,
* superblock is updated to new log tail. Updating superblock (either * superblock is updated to new log tail. Updating superblock (either
* directly call md_update_sb() or depend on md thread) must hold * directly call md_update_sb() or depend on md thread) must hold
* reconfig mutex. On the other hand, raid5_quiesce is called with * reconfig mutex. On the other hand, raid5_quiesce is called with
* reconfig_mutex hold. The first step of raid5_quiesce() is waitting * reconfig_mutex hold. The first step of raid5_quiesce() is waiting
* for all IO finish, hence waitting for reclaim thread, while reclaim * for all IO finish, hence waiting for reclaim thread, while reclaim
* thread is calling this function and waitting for reconfig mutex. So * thread is calling this function and waiting for reconfig mutex. So
* there is a deadlock. We workaround this issue with a trylock. * there is a deadlock. We workaround this issue with a trylock.
* FIXME: we could miss discard if we can't take reconfig mutex * FIXME: we could miss discard if we can't take reconfig mutex
*/ */
@@ -1923,7 +1923,8 @@ r5c_recovery_alloc_stripe(
{ {
struct stripe_head *sh; struct stripe_head *sh;
sh = raid5_get_active_stripe(conf, stripe_sect, 0, noblock, 0); sh = raid5_get_active_stripe(conf, NULL, stripe_sect,
noblock ? R5_GAS_NOBLOCK : 0);
if (!sh) if (!sh)
return NULL; /* no more stripe available */ return NULL; /* no more stripe available */
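
The r5c_recovery_alloc_stripe() hunk above is one instance of a wider interface change in the raid5.c hunks that follow: raid5_get_active_stripe() now takes the request context plus an R5_GAS_* flags word instead of three booleans. An illustrative call-site conversion (the wrapper below is ours; the flag names come from the series):

static struct stripe_head *example_get_stripe(struct r5conf *conf,
					      sector_t sector,
					      bool previous, bool noblock)
{
	unsigned int flags = 0;

	if (previous)
		flags |= R5_GAS_PREVIOUS;	/* use the pre-reshape geometry */
	if (noblock)
		flags |= R5_GAS_NOBLOCK;	/* return NULL instead of sleeping */

	/* R5_GAS_NOQUIESCE additionally ignores a quiesced array */
	return raid5_get_active_stripe(conf, NULL, sector, flags);
}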


@@ -36,6 +36,7 @@
*/ */
#include <linux/blkdev.h> #include <linux/blkdev.h>
#include <linux/delay.h>
#include <linux/kthread.h> #include <linux/kthread.h>
#include <linux/raid/pq.h> #include <linux/raid/pq.h>
#include <linux/async_tx.h> #include <linux/async_tx.h>
@@ -789,87 +790,80 @@ struct stripe_request_ctx {
*/ */
static bool is_inactive_blocked(struct r5conf *conf, int hash) static bool is_inactive_blocked(struct r5conf *conf, int hash)
{ {
int active = atomic_read(&conf->active_stripes);
if (list_empty(conf->inactive_list + hash)) if (list_empty(conf->inactive_list + hash))
return false; return false;
if (!test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state)) if (!test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state))
return true; return true;
return active < (conf->max_nr_stripes * 3 / 4); return (atomic_read(&conf->active_stripes) <
(conf->max_nr_stripes * 3 / 4));
} }
static struct stripe_head *__raid5_get_active_stripe(struct r5conf *conf, struct stripe_head *raid5_get_active_stripe(struct r5conf *conf,
struct stripe_request_ctx *ctx, sector_t sector, struct stripe_request_ctx *ctx, sector_t sector,
bool previous, bool noblock, bool noquiesce) unsigned int flags)
{ {
struct stripe_head *sh; struct stripe_head *sh;
int hash = stripe_hash_locks_hash(conf, sector); int hash = stripe_hash_locks_hash(conf, sector);
int previous = !!(flags & R5_GAS_PREVIOUS);
pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector); pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector);
spin_lock_irq(conf->hash_locks + hash); spin_lock_irq(conf->hash_locks + hash);
retry: for (;;) {
if (!noquiesce && conf->quiesce) { if (!(flags & R5_GAS_NOQUIESCE) && conf->quiesce) {
/* /*
* Must release the reference to batch_last before waiting, * Must release the reference to batch_last before
* on quiesce, otherwise the batch_last will hold a reference * waiting, on quiesce, otherwise the batch_last will
* to a stripe and raid5_quiesce() will deadlock waiting for * hold a reference to a stripe and raid5_quiesce()
* active_stripes to go to zero. * will deadlock waiting for active_stripes to go to
*/ * zero.
if (ctx && ctx->batch_last) { */
raid5_release_stripe(ctx->batch_last); if (ctx && ctx->batch_last) {
ctx->batch_last = NULL; raid5_release_stripe(ctx->batch_last);
ctx->batch_last = NULL;
}
wait_event_lock_irq(conf->wait_for_quiescent,
!conf->quiesce,
*(conf->hash_locks + hash));
} }
wait_event_lock_irq(conf->wait_for_quiescent, !conf->quiesce, sh = find_get_stripe(conf, sector, conf->generation - previous,
hash);
if (sh)
break;
if (!test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state)) {
sh = get_free_stripe(conf, hash);
if (sh) {
r5c_check_stripe_cache_usage(conf);
init_stripe(sh, sector, previous);
atomic_inc(&sh->count);
break;
}
if (!test_bit(R5_DID_ALLOC, &conf->cache_state))
set_bit(R5_ALLOC_MORE, &conf->cache_state);
}
if (flags & R5_GAS_NOBLOCK)
break;
set_bit(R5_INACTIVE_BLOCKED, &conf->cache_state);
r5l_wake_reclaim(conf->log, 0);
wait_event_lock_irq(conf->wait_for_stripe,
is_inactive_blocked(conf, hash),
*(conf->hash_locks + hash)); *(conf->hash_locks + hash));
clear_bit(R5_INACTIVE_BLOCKED, &conf->cache_state);
} }
sh = find_get_stripe(conf, sector, conf->generation - previous, hash);
if (sh)
goto out;
if (test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state))
goto wait_for_stripe;
sh = get_free_stripe(conf, hash);
if (sh) {
r5c_check_stripe_cache_usage(conf);
init_stripe(sh, sector, previous);
atomic_inc(&sh->count);
goto out;
}
if (!test_bit(R5_DID_ALLOC, &conf->cache_state))
set_bit(R5_ALLOC_MORE, &conf->cache_state);
wait_for_stripe:
if (noblock)
goto out;
set_bit(R5_INACTIVE_BLOCKED, &conf->cache_state);
r5l_wake_reclaim(conf->log, 0);
wait_event_lock_irq(conf->wait_for_stripe,
is_inactive_blocked(conf, hash),
*(conf->hash_locks + hash));
clear_bit(R5_INACTIVE_BLOCKED, &conf->cache_state);
goto retry;
out:
spin_unlock_irq(conf->hash_locks + hash); spin_unlock_irq(conf->hash_locks + hash);
return sh; return sh;
} }
struct stripe_head *raid5_get_active_stripe(struct r5conf *conf,
sector_t sector, bool previous, bool noblock, bool noquiesce)
{
return __raid5_get_active_stripe(conf, NULL, sector, previous, noblock,
noquiesce);
}
static bool is_full_stripe_write(struct stripe_head *sh) static bool is_full_stripe_write(struct stripe_head *sh)
{ {
BUG_ON(sh->overwrite_disks > (sh->disks - sh->raid_conf->max_degraded)); BUG_ON(sh->overwrite_disks > (sh->disks - sh->raid_conf->max_degraded));
@@ -4047,7 +4041,7 @@ static void handle_stripe_fill(struct stripe_head *sh,
* back cache (prexor with orig_page, and then xor with * back cache (prexor with orig_page, and then xor with
* page) in the read path * page) in the read path
*/ */
if (s->injournal && s->failed) { if (s->to_read && s->injournal && s->failed) {
if (test_bit(STRIPE_R5C_CACHING, &sh->state)) if (test_bit(STRIPE_R5C_CACHING, &sh->state))
r5c_make_stripe_write_out(sh); r5c_make_stripe_write_out(sh);
goto out; goto out;
@@ -4636,7 +4630,8 @@ static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh)
sector_t bn = raid5_compute_blocknr(sh, i, 1); sector_t bn = raid5_compute_blocknr(sh, i, 1);
sector_t s = raid5_compute_sector(conf, bn, 0, sector_t s = raid5_compute_sector(conf, bn, 0,
&dd_idx, NULL); &dd_idx, NULL);
sh2 = raid5_get_active_stripe(conf, s, 0, 1, 1); sh2 = raid5_get_active_stripe(conf, NULL, s,
R5_GAS_NOBLOCK | R5_GAS_NOQUIESCE);
if (sh2 == NULL) if (sh2 == NULL)
/* so far only the early blocks of this stripe /* so far only the early blocks of this stripe
* have been requested. When later blocks * have been requested. When later blocks
@@ -5273,7 +5268,9 @@ static void handle_stripe(struct stripe_head *sh)
/* Finish reconstruct operations initiated by the expansion process */
if (sh->reconstruct_state == reconstruct_state_result) {
struct stripe_head *sh_src
-= raid5_get_active_stripe(conf, sh->sector, 1, 1, 1);
+= raid5_get_active_stripe(conf, NULL, sh->sector,
+R5_GAS_PREVIOUS | R5_GAS_NOBLOCK |
+R5_GAS_NOQUIESCE);
if (sh_src && test_bit(STRIPE_EXPAND_SOURCE, &sh_src->state)) {
/* sh cannot be written until sh_src has been read.
* so arrange for sh to be delayed a little
@@ -5542,7 +5539,6 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
if (is_badblock(rdev, sector, bio_sectors(raid_bio), &first_bad,
&bad_sectors)) {
-bio_put(raid_bio);
rdev_dec_pending(rdev, mddev);
return 0;
}
@@ -5823,7 +5819,7 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
DEFINE_WAIT(w);
int d;
again:
-sh = raid5_get_active_stripe(conf, logical_sector, 0, 0, 0);
+sh = raid5_get_active_stripe(conf, NULL, logical_sector, 0);
prepare_to_wait(&conf->wait_for_overlap, &w,
TASK_UNINTERRUPTIBLE);
set_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags);
@@ -5978,7 +5974,7 @@ static enum stripe_result make_stripe_request(struct mddev *mddev,
enum stripe_result ret;
struct stripe_head *sh;
sector_t new_sector;
-int previous = 0;
+int previous = 0, flags = 0;
int seq, dd_idx;
seq = read_seqcount_begin(&conf->gen_lock);
@@ -6012,8 +6008,11 @@ static enum stripe_result make_stripe_request(struct mddev *mddev,
pr_debug("raid456: %s, sector %llu logical %llu\n", __func__, pr_debug("raid456: %s, sector %llu logical %llu\n", __func__,
new_sector, logical_sector); new_sector, logical_sector);
sh = __raid5_get_active_stripe(conf, ctx, new_sector, previous, if (previous)
(bi->bi_opf & REQ_RAHEAD), 0); flags |= R5_GAS_PREVIOUS;
if (bi->bi_opf & REQ_RAHEAD)
flags |= R5_GAS_NOBLOCK;
sh = raid5_get_active_stripe(conf, ctx, new_sector, flags);
if (unlikely(!sh)) { if (unlikely(!sh)) {
/* cannot get stripe, just give-up */ /* cannot get stripe, just give-up */
bi->bi_status = BLK_STS_IOERR; bi->bi_status = BLK_STS_IOERR;
@@ -6362,7 +6361,8 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
for (i = 0; i < reshape_sectors; i += RAID5_STRIPE_SECTORS(conf)) {
int j;
int skipped_disk = 0;
-sh = raid5_get_active_stripe(conf, stripe_addr+i, 0, 0, 1);
+sh = raid5_get_active_stripe(conf, NULL, stripe_addr+i,
+R5_GAS_NOQUIESCE);
set_bit(STRIPE_EXPANDING, &sh->state);
atomic_inc(&conf->reshape_stripes);
/* If any of this stripe is beyond the end of the old
@@ -6411,7 +6411,8 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
if (last_sector >= mddev->dev_sectors)
last_sector = mddev->dev_sectors - 1;
while (first_sector <= last_sector) {
-sh = raid5_get_active_stripe(conf, first_sector, 1, 0, 1);
+sh = raid5_get_active_stripe(conf, NULL, first_sector,
+R5_GAS_PREVIOUS | R5_GAS_NOQUIESCE);
set_bit(STRIPE_EXPAND_SOURCE, &sh->state);
set_bit(STRIPE_HANDLE, &sh->state);
raid5_release_stripe(sh);
@@ -6531,9 +6532,10 @@ static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_n
md_bitmap_cond_end_sync(mddev->bitmap, sector_nr, false);
-sh = raid5_get_active_stripe(conf, sector_nr, 0, 1, 0);
+sh = raid5_get_active_stripe(conf, NULL, sector_nr,
+R5_GAS_NOBLOCK);
if (sh == NULL) {
-sh = raid5_get_active_stripe(conf, sector_nr, 0, 0, 0);
+sh = raid5_get_active_stripe(conf, NULL, sector_nr, 0);
/* make sure we don't swamp the stripe cache if someone else
* is trying to get access
*/
@@ -6596,8 +6598,8 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio,
/* already done this stripe */
continue;
-sh = raid5_get_active_stripe(conf, sector, 0, 1, 1);
+sh = raid5_get_active_stripe(conf, NULL, sector,
+R5_GAS_NOBLOCK | R5_GAS_NOQUIESCE);
if (!sh) {
/* failed to get a stripe - must wait */
conf->retry_read_aligned = raid_bio;
@@ -6781,7 +6783,18 @@ static void raid5d(struct md_thread *thread)
spin_unlock_irq(&conf->device_lock);
md_check_recovery(mddev);
spin_lock_irq(&conf->device_lock);
/*
* Waiting on MD_SB_CHANGE_PENDING below may deadlock
* seeing md_check_recovery() is needed to clear
* the flag when using mdmon.
*/
continue;
}
wait_event_lock_irq(mddev->sb_wait,
!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags),
conf->device_lock);
}
pr_debug("%d stripes handled\n", handled);

drivers/md/raid5.h

@@ -803,16 +803,24 @@ raid5_get_dev_page(struct stripe_head *sh, int disk_idx)
}
#endif
-extern void md_raid5_kick_device(struct r5conf *conf);
-extern int raid5_set_cache_size(struct mddev *mddev, int size);
-extern sector_t raid5_compute_blocknr(struct stripe_head *sh, int i, int previous);
-extern void raid5_release_stripe(struct stripe_head *sh);
-extern sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector,
-int previous, int *dd_idx,
-struct stripe_head *sh);
-extern struct stripe_head *
-raid5_get_active_stripe(struct r5conf *conf, sector_t sector,
-bool previous, bool noblock, bool noquiesce);
-extern int raid5_calc_degraded(struct r5conf *conf);
-extern int r5c_journal_mode_set(struct mddev *mddev, int journal_mode);
+void md_raid5_kick_device(struct r5conf *conf);
+int raid5_set_cache_size(struct mddev *mddev, int size);
+sector_t raid5_compute_blocknr(struct stripe_head *sh, int i, int previous);
+void raid5_release_stripe(struct stripe_head *sh);
+sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector,
+int previous, int *dd_idx, struct stripe_head *sh);
+struct stripe_request_ctx;
+/* get stripe from previous generation (when reshaping) */
+#define R5_GAS_PREVIOUS (1 << 0)
+/* do not block waiting for a free stripe */
+#define R5_GAS_NOBLOCK (1 << 1)
+/* do not block waiting for quiesce to be released */
+#define R5_GAS_NOQUIESCE (1 << 2)
+struct stripe_head *raid5_get_active_stripe(struct r5conf *conf,
+struct stripe_request_ctx *ctx, sector_t sector,
+unsigned int flags);
+int raid5_calc_degraded(struct r5conf *conf);
+int r5c_journal_mode_set(struct mddev *mddev, int journal_mode);
#endif
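
/*
 * Illustrative sketch only, not part of the series: it shows how a caller of
 * the old bool-based raid5_get_active_stripe() prototype maps onto the
 * flag-based one declared above. example_get_stripe() is a hypothetical
 * wrapper; the real callers in the diff pass the R5_GAS_* flags directly.
 */
static inline struct stripe_head *example_get_stripe(struct r5conf *conf,
		sector_t sector, bool previous, bool noblock, bool noquiesce)
{
	unsigned int flags = 0;

	if (previous)
		flags |= R5_GAS_PREVIOUS;
	if (noblock)
		flags |= R5_GAS_NOBLOCK;
	if (noquiesce)
		flags |= R5_GAS_NOQUIESCE;

	/* ctx may be NULL when no batching context is in use */
	return raid5_get_active_stripe(conf, NULL, sector, flags);
}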

drivers/nvme/host/core.c

@@ -1111,8 +1111,8 @@ static u32 nvme_passthru_start(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
return effects;
}
-static void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects,
+void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects,
struct nvme_command *cmd, int status)
{
if (effects & NVME_CMD_EFFECTS_CSE_MASK) {
nvme_unfreeze(ctrl);
@@ -1148,21 +1148,16 @@ static void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects,
break;
}
}
+EXPORT_SYMBOL_NS_GPL(nvme_passthru_end, NVME_TARGET_PASSTHRU);
-int nvme_execute_passthru_rq(struct request *rq)
+int nvme_execute_passthru_rq(struct request *rq, u32 *effects)
{
struct nvme_command *cmd = nvme_req(rq)->cmd;
struct nvme_ctrl *ctrl = nvme_req(rq)->ctrl;
struct nvme_ns *ns = rq->q->queuedata;
-u32 effects;
-int ret;
-effects = nvme_passthru_start(ctrl, ns, cmd->common.opcode);
-ret = nvme_execute_rq(rq, false);
-if (effects) /* nothing to be done for zero cmd effects */
-nvme_passthru_end(ctrl, effects, cmd, ret);
-return ret;
+*effects = nvme_passthru_start(ctrl, ns, cmd->common.opcode);
+return nvme_execute_rq(rq, false);
}
EXPORT_SYMBOL_NS_GPL(nvme_execute_passthru_rq, NVME_TARGET_PASSTHRU);
@@ -2696,7 +2691,7 @@ static void nvme_init_subnqn(struct nvme_subsystem *subsys, struct nvme_ctrl *ct
if(!(ctrl->quirks & NVME_QUIRK_IGNORE_DEV_SUBNQN)) {
nqnlen = strnlen(id->subnqn, NVMF_NQN_SIZE);
if (nqnlen > 0 && nqnlen < NVMF_NQN_SIZE) {
-strlcpy(subsys->subnqn, id->subnqn, NVMF_NQN_SIZE);
+strscpy(subsys->subnqn, id->subnqn, NVMF_NQN_SIZE);
return;
}
@@ -2704,7 +2699,11 @@ static void nvme_init_subnqn(struct nvme_subsystem *subsys, struct nvme_ctrl *ct
dev_warn(ctrl->device, "missing or invalid SUBNQN field.\n"); dev_warn(ctrl->device, "missing or invalid SUBNQN field.\n");
} }
/* Generate a "fake" NQN per Figure 254 in NVMe 1.3 + ECN 001 */ /*
* Generate a "fake" NQN similar to the one in Section 4.5 of the NVMe
* Base Specification 2.0. It is slightly different from the format
* specified there due to historic reasons, and we can't change it now.
*/
off = snprintf(subsys->subnqn, NVMF_NQN_SIZE, off = snprintf(subsys->subnqn, NVMF_NQN_SIZE,
"nqn.2014.08.org.nvmexpress:%04x%04x", "nqn.2014.08.org.nvmexpress:%04x%04x",
le16_to_cpu(id->vid), le16_to_cpu(id->ssvid)); le16_to_cpu(id->vid), le16_to_cpu(id->ssvid));
@@ -2894,7 +2893,6 @@ static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
nvme_init_subnqn(subsys, ctrl, id);
memcpy(subsys->serial, id->sn, sizeof(subsys->serial));
memcpy(subsys->model, id->mn, sizeof(subsys->model));
-memcpy(subsys->firmware_rev, id->fr, sizeof(subsys->firmware_rev));
subsys->vendor_id = le16_to_cpu(id->vid);
subsys->cmic = id->cmic;
@@ -3113,6 +3111,8 @@ static int nvme_init_identify(struct nvme_ctrl *ctrl)
ctrl->quirks |= core_quirks[i].quirks;
}
}
+memcpy(ctrl->subsys->firmware_rev, id->fr,
+sizeof(ctrl->subsys->firmware_rev));
if (force_apst && (ctrl->quirks & NVME_QUIRK_NO_DEEPEST_PS)) {
dev_warn(ctrl->device, "forcibly allowing all power states due to nvme_core.force_apst -- use at your own risk\n");
@@ -4805,6 +4805,108 @@ void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status,
}
EXPORT_SYMBOL_GPL(nvme_complete_async_event);
int nvme_alloc_admin_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set,
const struct blk_mq_ops *ops, unsigned int flags,
unsigned int cmd_size)
{
int ret;
memset(set, 0, sizeof(*set));
set->ops = ops;
set->queue_depth = NVME_AQ_MQ_TAG_DEPTH;
if (ctrl->ops->flags & NVME_F_FABRICS)
set->reserved_tags = NVMF_RESERVED_TAGS;
set->numa_node = ctrl->numa_node;
set->flags = flags;
set->cmd_size = cmd_size;
set->driver_data = ctrl;
set->nr_hw_queues = 1;
set->timeout = NVME_ADMIN_TIMEOUT;
ret = blk_mq_alloc_tag_set(set);
if (ret)
return ret;
ctrl->admin_q = blk_mq_init_queue(set);
if (IS_ERR(ctrl->admin_q)) {
ret = PTR_ERR(ctrl->admin_q);
goto out_free_tagset;
}
if (ctrl->ops->flags & NVME_F_FABRICS) {
ctrl->fabrics_q = blk_mq_init_queue(set);
if (IS_ERR(ctrl->fabrics_q)) {
ret = PTR_ERR(ctrl->fabrics_q);
goto out_cleanup_admin_q;
}
}
ctrl->admin_tagset = set;
return 0;
out_cleanup_admin_q:
blk_mq_destroy_queue(ctrl->fabrics_q);
out_free_tagset:
blk_mq_free_tag_set(ctrl->admin_tagset);
return ret;
}
EXPORT_SYMBOL_GPL(nvme_alloc_admin_tag_set);
void nvme_remove_admin_tag_set(struct nvme_ctrl *ctrl)
{
blk_mq_destroy_queue(ctrl->admin_q);
if (ctrl->ops->flags & NVME_F_FABRICS)
blk_mq_destroy_queue(ctrl->fabrics_q);
blk_mq_free_tag_set(ctrl->admin_tagset);
}
EXPORT_SYMBOL_GPL(nvme_remove_admin_tag_set);
int nvme_alloc_io_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set,
const struct blk_mq_ops *ops, unsigned int flags,
unsigned int cmd_size)
{
int ret;
memset(set, 0, sizeof(*set));
set->ops = ops;
set->queue_depth = ctrl->sqsize + 1;
set->reserved_tags = NVMF_RESERVED_TAGS;
set->numa_node = ctrl->numa_node;
set->flags = flags;
set->cmd_size = cmd_size,
set->driver_data = ctrl;
set->nr_hw_queues = ctrl->queue_count - 1;
set->timeout = NVME_IO_TIMEOUT;
if (ops->map_queues)
set->nr_maps = ctrl->opts->nr_poll_queues ? HCTX_MAX_TYPES : 2;
ret = blk_mq_alloc_tag_set(set);
if (ret)
return ret;
if (ctrl->ops->flags & NVME_F_FABRICS) {
ctrl->connect_q = blk_mq_init_queue(set);
if (IS_ERR(ctrl->connect_q)) {
ret = PTR_ERR(ctrl->connect_q);
goto out_free_tag_set;
}
}
ctrl->tagset = set;
return 0;
out_free_tag_set:
blk_mq_free_tag_set(set);
return ret;
}
EXPORT_SYMBOL_GPL(nvme_alloc_io_tag_set);
void nvme_remove_io_tag_set(struct nvme_ctrl *ctrl)
{
if (ctrl->ops->flags & NVME_F_FABRICS)
blk_mq_destroy_queue(ctrl->connect_q);
blk_mq_free_tag_set(ctrl->tagset);
}
EXPORT_SYMBOL_GPL(nvme_remove_io_tag_set);
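/*
 * Minimal usage sketch for the helpers above, not taken from any driver in
 * this merge: example_ctrl, example_request, example_admin_mq_ops and
 * example_mq_ops are hypothetical. It only shows the intended pairing of the
 * alloc/remove helpers for a fabrics driver.
 */
static int example_setup_tag_sets(struct example_ctrl *ectrl)
{
	int ret;

	ret = nvme_alloc_admin_tag_set(&ectrl->ctrl, &ectrl->admin_tag_set,
			&example_admin_mq_ops, BLK_MQ_F_NO_SCHED,
			sizeof(struct example_request));
	if (ret)
		return ret;

	ret = nvme_alloc_io_tag_set(&ectrl->ctrl, &ectrl->tag_set,
			&example_mq_ops, BLK_MQ_F_SHOULD_MERGE,
			sizeof(struct example_request));
	if (ret)
		nvme_remove_admin_tag_set(&ectrl->ctrl); /* unwind on failure */
	return ret;
}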
void nvme_stop_ctrl(struct nvme_ctrl *ctrl)
{
nvme_mpath_stop(ctrl);
@@ -4824,6 +4926,16 @@ void nvme_start_ctrl(struct nvme_ctrl *ctrl)
nvme_enable_aen(ctrl);
/*
* persistent discovery controllers need to send indication to userspace
* to re-read the discovery log page to learn about possible changes
* that were missed. We identify persistent discovery controllers by
* checking that they started once before, hence are reconnecting back.
*/
if (test_and_set_bit(NVME_CTRL_STARTED_ONCE, &ctrl->flags) &&
nvme_discovery_ctrl(ctrl))
nvme_change_uevent(ctrl, "NVME_EVENT=rediscover");
if (ctrl->queue_count > 1) {
nvme_queue_scan(ctrl);
nvme_start_queues(ctrl);

drivers/nvme/host/fabrics.c

@@ -49,7 +49,7 @@ static struct nvmf_host *nvmf_host_add(const char *hostnqn)
goto out_unlock;
kref_init(&host->ref);
-strlcpy(host->nqn, hostnqn, NVMF_NQN_SIZE);
+strscpy(host->nqn, hostnqn, NVMF_NQN_SIZE);
list_add_tail(&host->list, &nvmf_hosts);
out_unlock:
@@ -971,13 +971,17 @@ bool nvmf_ip_options_match(struct nvme_ctrl *ctrl,
return false;
/*
- * Checking the local address is rough. In most cases, none is specified
- * and the host port is selected by the stack.
+ * Checking the local address or host interfaces is rough.
+ *
+ * In most cases, none is specified and the host port or
+ * host interface is selected by the stack.
*
* Assume no match if:
- * - local address is specified and address is not the same
- * - local address is not specified but remote is, or vice versa
- *   (admin using specific host_traddr when it matters).
+ * - local address or host interface is specified and address
+ *   or host interface is not the same
+ * - local address or host interface is not specified but
+ *   remote is, or vice versa (admin using specific
+ *   host_traddr/host_iface when it matters).
*/
if ((opts->mask & NVMF_OPT_HOST_TRADDR) &&
(ctrl->opts->mask & NVMF_OPT_HOST_TRADDR)) {
@@ -988,6 +992,15 @@ bool nvmf_ip_options_match(struct nvme_ctrl *ctrl,
return false;
}
if ((opts->mask & NVMF_OPT_HOST_IFACE) &&
(ctrl->opts->mask & NVMF_OPT_HOST_IFACE)) {
if (strcmp(opts->host_iface, ctrl->opts->host_iface))
return false;
} else if ((opts->mask & NVMF_OPT_HOST_IFACE) ||
(ctrl->opts->mask & NVMF_OPT_HOST_IFACE)) {
return false;
}
return true;
}
EXPORT_SYMBOL_GPL(nvmf_ip_options_match);

drivers/nvme/host/fc.c

@@ -1829,7 +1829,7 @@ nvme_fc_exit_request(struct blk_mq_tag_set *set, struct request *rq,
{
struct nvme_fc_fcp_op *op = blk_mq_rq_to_pdu(rq);
-return __nvme_fc_exit_request(set->driver_data, op);
+return __nvme_fc_exit_request(to_fc_ctrl(set->driver_data), op);
}
static int
@@ -2135,7 +2135,7 @@ static int
nvme_fc_init_request(struct blk_mq_tag_set *set, struct request *rq,
unsigned int hctx_idx, unsigned int numa_node)
{
-struct nvme_fc_ctrl *ctrl = set->driver_data;
+struct nvme_fc_ctrl *ctrl = to_fc_ctrl(set->driver_data);
struct nvme_fcp_op_w_sgl *op = blk_mq_rq_to_pdu(rq);
int queue_idx = (set == &ctrl->tag_set) ? hctx_idx + 1 : 0;
struct nvme_fc_queue *queue = &ctrl->queues[queue_idx];
@@ -2206,36 +2206,28 @@ nvme_fc_term_aen_ops(struct nvme_fc_ctrl *ctrl)
}
}
-static inline void
-__nvme_fc_init_hctx(struct blk_mq_hw_ctx *hctx, struct nvme_fc_ctrl *ctrl,
-unsigned int qidx)
+static inline int
+__nvme_fc_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, unsigned int qidx)
{
+struct nvme_fc_ctrl *ctrl = to_fc_ctrl(data);
struct nvme_fc_queue *queue = &ctrl->queues[qidx];
hctx->driver_data = queue;
queue->hctx = hctx;
+return 0;
}
static int
-nvme_fc_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
-unsigned int hctx_idx)
+nvme_fc_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, unsigned int hctx_idx)
{
-struct nvme_fc_ctrl *ctrl = data;
-__nvme_fc_init_hctx(hctx, ctrl, hctx_idx + 1);
-return 0;
+return __nvme_fc_init_hctx(hctx, data, hctx_idx + 1);
}
static int
nvme_fc_init_admin_hctx(struct blk_mq_hw_ctx *hctx, void *data,
unsigned int hctx_idx)
{
-struct nvme_fc_ctrl *ctrl = data;
-__nvme_fc_init_hctx(hctx, ctrl, hctx_idx);
-return 0;
+return __nvme_fc_init_hctx(hctx, data, hctx_idx);
}
static void
@@ -2391,10 +2383,8 @@ nvme_fc_ctrl_free(struct kref *ref)
container_of(ref, struct nvme_fc_ctrl, ref);
unsigned long flags;
-if (ctrl->ctrl.tagset) {
-blk_mq_destroy_queue(ctrl->ctrl.connect_q);
-blk_mq_free_tag_set(&ctrl->tag_set);
-}
+if (ctrl->ctrl.tagset)
+nvme_remove_io_tag_set(&ctrl->ctrl);
/* remove from rport list */
spin_lock_irqsave(&ctrl->rport->lock, flags);
@@ -2402,9 +2392,7 @@ nvme_fc_ctrl_free(struct kref *ref)
spin_unlock_irqrestore(&ctrl->rport->lock, flags);
nvme_start_admin_queue(&ctrl->ctrl);
-blk_mq_destroy_queue(ctrl->ctrl.admin_q);
-blk_mq_destroy_queue(ctrl->ctrl.fabrics_q);
-blk_mq_free_tag_set(&ctrl->admin_tag_set);
+nvme_remove_admin_tag_set(&ctrl->ctrl);
kfree(ctrl->queues);
@@ -2860,9 +2848,9 @@ nvme_fc_complete_rq(struct request *rq)
nvme_fc_ctrl_put(ctrl);
}
-static int nvme_fc_map_queues(struct blk_mq_tag_set *set)
+static void nvme_fc_map_queues(struct blk_mq_tag_set *set)
{
-struct nvme_fc_ctrl *ctrl = set->driver_data;
+struct nvme_fc_ctrl *ctrl = to_fc_ctrl(set->driver_data);
int i;
for (i = 0; i < set->nr_maps; i++) {
@@ -2880,7 +2868,6 @@ static int nvme_fc_map_queues(struct blk_mq_tag_set *set)
else
blk_mq_map_queues(map);
}
-return 0;
}
static const struct blk_mq_ops nvme_fc_mq_ops = {
@@ -2915,32 +2902,16 @@ nvme_fc_create_io_queues(struct nvme_fc_ctrl *ctrl)
nvme_fc_init_io_queues(ctrl);
-memset(&ctrl->tag_set, 0, sizeof(ctrl->tag_set));
-ctrl->tag_set.ops = &nvme_fc_mq_ops;
-ctrl->tag_set.queue_depth = ctrl->ctrl.opts->queue_size;
-ctrl->tag_set.reserved_tags = NVMF_RESERVED_TAGS;
-ctrl->tag_set.numa_node = ctrl->ctrl.numa_node;
-ctrl->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
-ctrl->tag_set.cmd_size =
-struct_size((struct nvme_fcp_op_w_sgl *)NULL, priv,
-ctrl->lport->ops->fcprqst_priv_sz);
-ctrl->tag_set.driver_data = ctrl;
-ctrl->tag_set.nr_hw_queues = ctrl->ctrl.queue_count - 1;
-ctrl->tag_set.timeout = NVME_IO_TIMEOUT;
-ret = blk_mq_alloc_tag_set(&ctrl->tag_set);
+ret = nvme_alloc_io_tag_set(&ctrl->ctrl, &ctrl->tag_set,
+&nvme_fc_mq_ops, BLK_MQ_F_SHOULD_MERGE,
+struct_size((struct nvme_fcp_op_w_sgl *)NULL, priv,
+ctrl->lport->ops->fcprqst_priv_sz));
if (ret)
return ret;
-ctrl->ctrl.tagset = &ctrl->tag_set;
-ret = nvme_ctrl_init_connect_q(&(ctrl->ctrl));
-if (ret)
-goto out_free_tag_set;
ret = nvme_fc_create_hw_io_queues(ctrl, ctrl->ctrl.sqsize + 1);
if (ret)
-goto out_cleanup_blk_queue;
+goto out_cleanup_tagset;
ret = nvme_fc_connect_io_queues(ctrl, ctrl->ctrl.sqsize + 1);
if (ret)
@@ -2952,10 +2923,8 @@ nvme_fc_create_io_queues(struct nvme_fc_ctrl *ctrl)
out_delete_hw_queues:
nvme_fc_delete_hw_io_queues(ctrl);
-out_cleanup_blk_queue:
-blk_mq_destroy_queue(ctrl->ctrl.connect_q);
-out_free_tag_set:
-blk_mq_free_tag_set(&ctrl->tag_set);
+out_cleanup_tagset:
+nvme_remove_io_tag_set(&ctrl->ctrl);
nvme_fc_free_io_queues(ctrl);
/* force put free routine to ignore io queues */
@@ -3166,15 +3135,7 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl)
"to maxcmd\n",
opts->queue_size, ctrl->ctrl.maxcmd);
opts->queue_size = ctrl->ctrl.maxcmd;
-}
-if (opts->queue_size > ctrl->ctrl.sqsize + 1) {
-/* warn if sqsize is lower than queue_size */
-dev_warn(ctrl->ctrl.device,
-"queue_size %zu > ctrl sqsize %u, reducing "
-"to sqsize\n",
-opts->queue_size, ctrl->ctrl.sqsize + 1);
-opts->queue_size = ctrl->ctrl.sqsize + 1;
+ctrl->ctrl.sqsize = opts->queue_size - 1;
}
ret = nvme_fc_init_aen_ops(ctrl);
@@ -3547,35 +3508,12 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts,
nvme_fc_init_queue(ctrl, 0);
-memset(&ctrl->admin_tag_set, 0, sizeof(ctrl->admin_tag_set));
-ctrl->admin_tag_set.ops = &nvme_fc_admin_mq_ops;
-ctrl->admin_tag_set.queue_depth = NVME_AQ_MQ_TAG_DEPTH;
-ctrl->admin_tag_set.reserved_tags = NVMF_RESERVED_TAGS;
-ctrl->admin_tag_set.numa_node = ctrl->ctrl.numa_node;
-ctrl->admin_tag_set.cmd_size =
-struct_size((struct nvme_fcp_op_w_sgl *)NULL, priv,
-ctrl->lport->ops->fcprqst_priv_sz);
-ctrl->admin_tag_set.driver_data = ctrl;
-ctrl->admin_tag_set.nr_hw_queues = 1;
-ctrl->admin_tag_set.timeout = NVME_ADMIN_TIMEOUT;
-ctrl->admin_tag_set.flags = BLK_MQ_F_NO_SCHED;
-ret = blk_mq_alloc_tag_set(&ctrl->admin_tag_set);
+ret = nvme_alloc_admin_tag_set(&ctrl->ctrl, &ctrl->admin_tag_set,
+&nvme_fc_admin_mq_ops, BLK_MQ_F_NO_SCHED,
+struct_size((struct nvme_fcp_op_w_sgl *)NULL, priv,
+ctrl->lport->ops->fcprqst_priv_sz));
if (ret)
goto out_free_queues;
-ctrl->ctrl.admin_tagset = &ctrl->admin_tag_set;
-ctrl->ctrl.fabrics_q = blk_mq_init_queue(&ctrl->admin_tag_set);
-if (IS_ERR(ctrl->ctrl.fabrics_q)) {
-ret = PTR_ERR(ctrl->ctrl.fabrics_q);
-goto out_free_admin_tag_set;
-}
-ctrl->ctrl.admin_q = blk_mq_init_queue(&ctrl->admin_tag_set);
-if (IS_ERR(ctrl->ctrl.admin_q)) {
-ret = PTR_ERR(ctrl->ctrl.admin_q);
-goto out_cleanup_fabrics_q;
-}
/*
* Would have been nice to init io queues tag set as well.
@@ -3586,7 +3524,7 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts,
ret = nvme_init_ctrl(&ctrl->ctrl, dev, &nvme_fc_ctrl_ops, 0);
if (ret)
-goto out_cleanup_admin_q;
+goto out_cleanup_tagset;
/* at this point, teardown path changes to ref counting on nvme ctrl */
@@ -3641,12 +3579,8 @@ fail_ctrl:
return ERR_PTR(-EIO);
-out_cleanup_admin_q:
-blk_mq_destroy_queue(ctrl->ctrl.admin_q);
-out_cleanup_fabrics_q:
-blk_mq_destroy_queue(ctrl->ctrl.fabrics_q);
-out_free_admin_tag_set:
-blk_mq_free_tag_set(&ctrl->admin_tag_set);
+out_cleanup_tagset:
+nvme_remove_admin_tag_set(&ctrl->ctrl);
out_free_queues:
kfree(ctrl->queues);
out_free_ida:

drivers/nvme/host/ioctl.c

@@ -136,9 +136,11 @@ static int nvme_submit_user_cmd(struct request_queue *q,
unsigned bufflen, void __user *meta_buffer, unsigned meta_len,
u32 meta_seed, u64 *result, unsigned timeout, bool vec)
{
+struct nvme_ctrl *ctrl;
struct request *req;
void *meta = NULL;
struct bio *bio;
+u32 effects;
int ret;
req = nvme_alloc_user_request(q, cmd, ubuffer, bufflen, meta_buffer,
@@ -147,8 +149,9 @@ static int nvme_submit_user_cmd(struct request_queue *q,
return PTR_ERR(req);
bio = req->bio;
+ctrl = nvme_req(req)->ctrl;
-ret = nvme_execute_passthru_rq(req);
+ret = nvme_execute_passthru_rq(req, &effects);
if (result)
*result = le64_to_cpu(nvme_req(req)->result.u64);
@@ -158,6 +161,10 @@ static int nvme_submit_user_cmd(struct request_queue *q,
if (bio)
blk_rq_unmap_user(bio);
blk_mq_free_request(req);
if (effects)
nvme_passthru_end(ctrl, effects, cmd, ret);
return ret;
}
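/*
 * Condensed sketch of the reworked passthrough flow above, not literal
 * driver code (example_passthru() is hypothetical): the command effects are
 * now returned to the caller and nvme_passthru_end() runs only after the
 * request and its bio have been released.
 */
static int example_passthru(struct request *req, struct nvme_ctrl *ctrl,
		struct nvme_command *cmd)
{
	u32 effects;
	int ret;

	ret = nvme_execute_passthru_rq(req, &effects);
	blk_mq_free_request(req);	/* free the request first */
	if (effects)			/* nothing to do for zero effects */
		nvme_passthru_end(ctrl, effects, cmd, ret);
	return ret;
}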
@@ -824,11 +831,17 @@ long nvme_dev_ioctl(struct file *file, unsigned int cmd,
case NVME_IOCTL_IO_CMD:
return nvme_dev_user_cmd(ctrl, argp);
case NVME_IOCTL_RESET:
if (!capable(CAP_SYS_ADMIN))
return -EACCES;
dev_warn(ctrl->device, "resetting controller\n"); dev_warn(ctrl->device, "resetting controller\n");
return nvme_reset_ctrl_sync(ctrl); return nvme_reset_ctrl_sync(ctrl);
case NVME_IOCTL_SUBSYS_RESET: case NVME_IOCTL_SUBSYS_RESET:
if (!capable(CAP_SYS_ADMIN))
return -EACCES;
return nvme_reset_subsystem(ctrl);
case NVME_IOCTL_RESCAN:
if (!capable(CAP_SYS_ADMIN))
return -EACCES;
nvme_queue_scan(ctrl);
return 0;
default:

drivers/nvme/host/nvme.h

@@ -233,6 +233,12 @@ struct nvme_fault_inject {
#endif
};
enum nvme_ctrl_flags {
NVME_CTRL_FAILFAST_EXPIRED = 0,
NVME_CTRL_ADMIN_Q_STOPPED = 1,
NVME_CTRL_STARTED_ONCE = 2,
};
struct nvme_ctrl {
bool comp_seen;
enum nvme_ctrl_state state;
@@ -354,8 +360,6 @@ struct nvme_ctrl {
u16 maxcmd;
int nr_reconnects;
unsigned long flags;
-#define NVME_CTRL_FAILFAST_EXPIRED 0
-#define NVME_CTRL_ADMIN_Q_STOPPED 1
struct nvmf_ctrl_options *opts;
struct page *discard_page;
@@ -602,11 +606,23 @@ static inline void nvme_fault_inject_fini(struct nvme_fault_inject *fault_inj)
static inline void nvme_should_fail(struct request *req) {}
#endif
bool nvme_wait_reset(struct nvme_ctrl *ctrl);
int nvme_try_sched_reset(struct nvme_ctrl *ctrl);
static inline int nvme_reset_subsystem(struct nvme_ctrl *ctrl)
{
+int ret;
if (!ctrl->subsystem)
return -ENOTTY;
-return ctrl->ops->reg_write32(ctrl, NVME_REG_NSSR, 0x4E564D65);
+if (!nvme_wait_reset(ctrl))
+return -EBUSY;
+ret = ctrl->ops->reg_write32(ctrl, NVME_REG_NSSR, 0x4E564D65);
+if (ret)
+return ret;
+return nvme_try_sched_reset(ctrl);
}
/*
@@ -712,7 +728,6 @@ void nvme_cancel_tagset(struct nvme_ctrl *ctrl);
void nvme_cancel_admin_tagset(struct nvme_ctrl *ctrl);
bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
enum nvme_ctrl_state new_state);
-bool nvme_wait_reset(struct nvme_ctrl *ctrl);
int nvme_disable_ctrl(struct nvme_ctrl *ctrl);
int nvme_enable_ctrl(struct nvme_ctrl *ctrl);
int nvme_shutdown_ctrl(struct nvme_ctrl *ctrl);
@@ -722,6 +737,14 @@ void nvme_uninit_ctrl(struct nvme_ctrl *ctrl);
void nvme_start_ctrl(struct nvme_ctrl *ctrl);
void nvme_stop_ctrl(struct nvme_ctrl *ctrl);
int nvme_init_ctrl_finish(struct nvme_ctrl *ctrl);
int nvme_alloc_admin_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set,
const struct blk_mq_ops *ops, unsigned int flags,
unsigned int cmd_size);
void nvme_remove_admin_tag_set(struct nvme_ctrl *ctrl);
int nvme_alloc_io_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set,
const struct blk_mq_ops *ops, unsigned int flags,
unsigned int cmd_size);
void nvme_remove_io_tag_set(struct nvme_ctrl *ctrl);
void nvme_remove_namespaces(struct nvme_ctrl *ctrl);
@@ -802,7 +825,6 @@ int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count);
void nvme_stop_keep_alive(struct nvme_ctrl *ctrl);
int nvme_reset_ctrl(struct nvme_ctrl *ctrl);
int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl);
-int nvme_try_sched_reset(struct nvme_ctrl *ctrl);
int nvme_delete_ctrl(struct nvme_ctrl *ctrl);
void nvme_queue_scan(struct nvme_ctrl *ctrl);
int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp, u8 csi,
@@ -972,14 +994,6 @@ static inline int nvme_update_zone_info(struct nvme_ns *ns, unsigned lbaf)
}
#endif
static inline int nvme_ctrl_init_connect_q(struct nvme_ctrl *ctrl)
{
ctrl->connect_q = blk_mq_init_queue(ctrl->tagset);
if (IS_ERR(ctrl->connect_q))
return PTR_ERR(ctrl->connect_q);
return 0;
}
static inline struct nvme_ns *nvme_get_ns_from_dev(struct device *dev)
{
return dev_to_disk(dev)->private_data;
@@ -1027,7 +1041,9 @@ static inline void nvme_auth_free(struct nvme_ctrl *ctrl) {};
u32 nvme_command_effects(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
u8 opcode);
-int nvme_execute_passthru_rq(struct request *rq);
+int nvme_execute_passthru_rq(struct request *rq, u32 *effects);
+void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects,
+struct nvme_command *cmd, int status);
struct nvme_ctrl *nvme_ctrl_from_file(struct file *file);
struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid);
void nvme_put_ns(struct nvme_ns *ns);

drivers/nvme/host/pci.c

@@ -226,12 +226,12 @@ struct nvme_queue {
struct nvme_iod {
struct nvme_request req;
struct nvme_command cmd;
-struct nvme_queue *nvmeq;
bool use_sgl;
-int aborted;
-int npages; /* In the PRP list. 0 means small pool in use */
-dma_addr_t first_dma;
+bool aborted;
+s8 nr_allocations; /* PRP list pool allocations. 0 means small
+pool in use */
unsigned int dma_len; /* length of single DMA segment mapping */
+dma_addr_t first_dma;
dma_addr_t meta_dma;
struct sg_table sgt;
};
@@ -430,11 +430,6 @@ static int nvme_pci_init_request(struct blk_mq_tag_set *set,
{
struct nvme_dev *dev = set->driver_data;
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
-int queue_idx = (set == &dev->tagset) ? hctx_idx + 1 : 0;
-struct nvme_queue *nvmeq = &dev->queues[queue_idx];
-BUG_ON(!nvmeq);
-iod->nvmeq = nvmeq;
nvme_req(req)->ctrl = &dev->ctrl;
nvme_req(req)->cmd = &iod->cmd;
@@ -450,7 +445,7 @@ static int queue_irq_offset(struct nvme_dev *dev)
return 0;
}
-static int nvme_pci_map_queues(struct blk_mq_tag_set *set)
+static void nvme_pci_map_queues(struct blk_mq_tag_set *set)
{
struct nvme_dev *dev = set->driver_data;
int i, qoff, offset;
@@ -477,8 +472,6 @@ static int nvme_pci_map_queues(struct blk_mq_tag_set *set)
qoff += map->nr_queues;
offset += map->nr_queues;
}
-return 0;
}
/*
@@ -528,7 +521,7 @@ static void **nvme_pci_iod_list(struct request *req)
static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req)
{
-struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
int nseg = blk_rq_nr_phys_segments(req);
unsigned int avg_seg_size;
@@ -536,7 +529,7 @@ static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req)
if (!nvme_ctrl_sgl_supported(&dev->ctrl))
return false;
-if (!iod->nvmeq->qid)
+if (!nvmeq->qid)
return false;
if (!sgl_threshold || avg_seg_size < sgl_threshold)
return false;
@@ -550,7 +543,7 @@ static void nvme_free_prps(struct nvme_dev *dev, struct request *req)
dma_addr_t dma_addr = iod->first_dma;
int i;
-for (i = 0; i < iod->npages; i++) {
+for (i = 0; i < iod->nr_allocations; i++) {
__le64 *prp_list = nvme_pci_iod_list(req)[i];
dma_addr_t next_dma_addr = le64_to_cpu(prp_list[last_prp]);
@@ -566,7 +559,7 @@ static void nvme_free_sgls(struct nvme_dev *dev, struct request *req)
dma_addr_t dma_addr = iod->first_dma;
int i;
-for (i = 0; i < iod->npages; i++) {
+for (i = 0; i < iod->nr_allocations; i++) {
struct nvme_sgl_desc *sg_list = nvme_pci_iod_list(req)[i];
dma_addr_t next_dma_addr = le64_to_cpu((sg_list[last_sg]).addr);
@@ -589,7 +582,7 @@ static void nvme_unmap_data(struct nvme_dev *dev, struct request *req)
dma_unmap_sgtable(dev->dev, &iod->sgt, rq_dma_dir(req), 0);
-if (iod->npages == 0)
+if (iod->nr_allocations == 0)
dma_pool_free(dev->prp_small_pool, nvme_pci_iod_list(req)[0],
iod->first_dma);
else if (iod->use_sgl)
@@ -651,15 +644,15 @@ static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev,
nprps = DIV_ROUND_UP(length, NVME_CTRL_PAGE_SIZE);
if (nprps <= (256 / 8)) {
pool = dev->prp_small_pool;
-iod->npages = 0;
+iod->nr_allocations = 0;
} else {
pool = dev->prp_page_pool;
-iod->npages = 1;
+iod->nr_allocations = 1;
}
prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma);
if (!prp_list) {
-iod->npages = -1;
+iod->nr_allocations = -1;
return BLK_STS_RESOURCE;
}
list[0] = prp_list;
@@ -671,7 +664,7 @@ static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev,
prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma);
if (!prp_list)
goto free_prps;
-list[iod->npages++] = prp_list;
+list[iod->nr_allocations++] = prp_list;
prp_list[0] = old_prp_list[i - 1];
old_prp_list[i - 1] = cpu_to_le64(prp_dma);
i = 1;
@@ -746,15 +739,15 @@ static blk_status_t nvme_pci_setup_sgls(struct nvme_dev *dev,
if (entries <= (256 / sizeof(struct nvme_sgl_desc))) {
pool = dev->prp_small_pool;
-iod->npages = 0;
+iod->nr_allocations = 0;
} else {
pool = dev->prp_page_pool;
-iod->npages = 1;
+iod->nr_allocations = 1;
}
sg_list = dma_pool_alloc(pool, GFP_ATOMIC, &sgl_dma);
if (!sg_list) {
-iod->npages = -1;
+iod->nr_allocations = -1;
return BLK_STS_RESOURCE;
}
@@ -773,7 +766,7 @@ static blk_status_t nvme_pci_setup_sgls(struct nvme_dev *dev,
goto free_sgls;
i = 0;
-nvme_pci_iod_list(req)[iod->npages++] = sg_list;
+nvme_pci_iod_list(req)[iod->nr_allocations++] = sg_list;
sg_list[i++] = *link;
nvme_pci_sgl_set_seg(link, sgl_dma, entries);
}
@@ -833,6 +826,7 @@ static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req,
int rc;
if (blk_rq_nr_phys_segments(req) == 1) {
+struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
struct bio_vec bv = req_bvec(req);
if (!is_pci_p2pdma_page(bv.bv_page)) {
@@ -840,7 +834,7 @@ static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req,
return nvme_setup_prp_simple(dev, req,
&cmnd->rw, &bv);
-if (iod->nvmeq->qid && sgl_threshold &&
+if (nvmeq->qid && sgl_threshold &&
nvme_ctrl_sgl_supported(&dev->ctrl))
return nvme_setup_sgl_simple(dev, req,
&cmnd->rw, &bv);
@@ -898,8 +892,8 @@ static blk_status_t nvme_prep_rq(struct nvme_dev *dev, struct request *req)
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
blk_status_t ret;
-iod->aborted = 0;
-iod->npages = -1;
+iod->aborted = false;
+iod->nr_allocations = -1;
iod->sgt.nents = 0;
ret = nvme_setup_cmd(req->q->queuedata, req);
@@ -1019,12 +1013,16 @@ static void nvme_queue_rqs(struct request **rqlist)
static __always_inline void nvme_pci_unmap_rq(struct request *req)
{
-struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
-struct nvme_dev *dev = iod->nvmeq->dev;
-if (blk_integrity_rq(req))
-dma_unmap_page(dev->dev, iod->meta_dma,
-rq_integrity_vec(req)->bv_len, rq_data_dir(req));
+struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
+struct nvme_dev *dev = nvmeq->dev;
+if (blk_integrity_rq(req)) {
+struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+dma_unmap_page(dev->dev, iod->meta_dma,
+rq_integrity_vec(req)->bv_len, rq_data_dir(req));
+}
if (blk_rq_nr_phys_segments(req))
nvme_unmap_data(dev, req);
}
@@ -1272,8 +1270,7 @@ static int adapter_delete_sq(struct nvme_dev *dev, u16 sqid)
static void abort_endio(struct request *req, blk_status_t error)
{
-struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
-struct nvme_queue *nvmeq = iod->nvmeq;
+struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
dev_warn(nvmeq->dev->ctrl.device,
"Abort status: 0x%x", nvme_req(req)->status);
@@ -1335,7 +1332,7 @@ static void nvme_warn_reset(struct nvme_dev *dev, u32 csts)
static enum blk_eh_timer_return nvme_timeout(struct request *req)
{
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
-struct nvme_queue *nvmeq = iod->nvmeq;
+struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
struct nvme_dev *dev = nvmeq->dev;
struct request *abort_req;
struct nvme_command cmd = { };
@@ -1416,7 +1413,7 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req)
atomic_inc(&dev->ctrl.abort_limit);
return BLK_EH_RESET_TIMER;
}
-iod->aborted = 1;
+iod->aborted = true;
cmd.abort.opcode = nvme_admin_abort_cmd;
cmd.abort.cid = nvme_cid(req);
@@ -2529,9 +2526,11 @@ static void nvme_pci_alloc_tag_set(struct nvme_dev *dev)
set->ops = &nvme_mq_ops;
set->nr_hw_queues = dev->online_queues - 1;
-set->nr_maps = 2; /* default + read */
+set->nr_maps = 1;
+if (dev->io_queues[HCTX_TYPE_READ])
+set->nr_maps = 2;
if (dev->io_queues[HCTX_TYPE_POLL])
-set->nr_maps++;
+set->nr_maps = 3;
set->timeout = NVME_IO_TIMEOUT;
set->numa_node = dev->ctrl.numa_node;
set->queue_depth = min_t(unsigned, dev->q_depth, BLK_MQ_MAX_DEPTH) - 1;
@@ -2834,6 +2833,8 @@ static void nvme_reset_work(struct work_struct *work)
nvme_start_admin_queue(&dev->ctrl);
}
dma_set_min_align_mask(dev->dev, NVME_CTRL_PAGE_SIZE - 1);
/*
* Limit the max command size to prevent iod->sg allocations going
* over a single page.
@@ -2846,7 +2847,6 @@ static void nvme_reset_work(struct work_struct *work)
* Don't limit the IOMMU merged segment size.
*/
dma_set_max_seg_size(dev->dev, 0xffffffff);
-dma_set_min_align_mask(dev->dev, NVME_CTRL_PAGE_SIZE - 1);
mutex_unlock(&dev->shutdown_lock);
@@ -3569,6 +3569,8 @@ static int __init nvme_init(void)
BUILD_BUG_ON(sizeof(struct nvme_create_sq) != 64);
BUILD_BUG_ON(sizeof(struct nvme_delete_queue) != 64);
BUILD_BUG_ON(IRQ_AFFINITY_MAX_SETS < 2);
BUILD_BUG_ON(DIV_ROUND_UP(nvme_pci_npages_prp(), NVME_CTRL_PAGE_SIZE) >
S8_MAX);
return pci_register_driver(&nvme_driver);
}

drivers/nvme/host/rdma.c

@@ -295,7 +295,7 @@ static int nvme_rdma_init_request(struct blk_mq_tag_set *set,
struct request *rq, unsigned int hctx_idx,
unsigned int numa_node)
{
-struct nvme_rdma_ctrl *ctrl = set->driver_data;
+struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(set->driver_data);
struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
int queue_idx = (set == &ctrl->tag_set) ? hctx_idx + 1 : 0;
struct nvme_rdma_queue *queue = &ctrl->queues[queue_idx];
@@ -320,7 +320,7 @@ static int nvme_rdma_init_request(struct blk_mq_tag_set *set,
static int nvme_rdma_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
unsigned int hctx_idx)
{
-struct nvme_rdma_ctrl *ctrl = data;
+struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(data);
struct nvme_rdma_queue *queue = &ctrl->queues[hctx_idx + 1];
BUG_ON(hctx_idx >= ctrl->ctrl.queue_count);
@@ -332,7 +332,7 @@ static int nvme_rdma_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
static int nvme_rdma_init_admin_hctx(struct blk_mq_hw_ctx *hctx, void *data,
unsigned int hctx_idx)
{
-struct nvme_rdma_ctrl *ctrl = data;
+struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(data);
struct nvme_rdma_queue *queue = &ctrl->queues[0];
BUG_ON(hctx_idx != 0);
@@ -696,11 +696,12 @@ static int nvme_rdma_start_queue(struct nvme_rdma_ctrl *ctrl, int idx)
return ret;
}
-static int nvme_rdma_start_io_queues(struct nvme_rdma_ctrl *ctrl)
+static int nvme_rdma_start_io_queues(struct nvme_rdma_ctrl *ctrl,
+int first, int last)
{
int i, ret = 0;
-for (i = 1; i < ctrl->ctrl.queue_count; i++) {
+for (i = first; i < last; i++) {
ret = nvme_rdma_start_queue(ctrl, i);
if (ret)
goto out_stop_queues;
@@ -709,7 +710,7 @@ static int nvme_rdma_start_io_queues(struct nvme_rdma_ctrl *ctrl)
return 0;
out_stop_queues:
-for (i--; i >= 1; i--)
+for (i--; i >= first; i--)
nvme_rdma_stop_queue(&ctrl->queues[i]);
return ret;
}
@@ -787,64 +788,21 @@ out_free_queues:
return ret;
}
-static int nvme_rdma_alloc_admin_tag_set(struct nvme_ctrl *nctrl)
-{
-struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl);
-struct blk_mq_tag_set *set = &ctrl->admin_tag_set;
-int ret;
-memset(set, 0, sizeof(*set));
-set->ops = &nvme_rdma_admin_mq_ops;
-set->queue_depth = NVME_AQ_MQ_TAG_DEPTH;
-set->reserved_tags = NVMF_RESERVED_TAGS;
-set->numa_node = nctrl->numa_node;
-set->cmd_size = sizeof(struct nvme_rdma_request) +
-NVME_RDMA_DATA_SGL_SIZE;
-set->driver_data = ctrl;
-set->nr_hw_queues = 1;
-set->timeout = NVME_ADMIN_TIMEOUT;
-set->flags = BLK_MQ_F_NO_SCHED;
-ret = blk_mq_alloc_tag_set(set);
-if (!ret)
-ctrl->ctrl.admin_tagset = set;
-return ret;
-}
-static int nvme_rdma_alloc_tag_set(struct nvme_ctrl *nctrl)
-{
-struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl);
-struct blk_mq_tag_set *set = &ctrl->tag_set;
-int ret;
-memset(set, 0, sizeof(*set));
-set->ops = &nvme_rdma_mq_ops;
-set->queue_depth = nctrl->sqsize + 1;
-set->reserved_tags = NVMF_RESERVED_TAGS;
-set->numa_node = nctrl->numa_node;
-set->flags = BLK_MQ_F_SHOULD_MERGE;
-set->cmd_size = sizeof(struct nvme_rdma_request) +
-NVME_RDMA_DATA_SGL_SIZE;
-if (nctrl->max_integrity_segments)
-set->cmd_size += sizeof(struct nvme_rdma_sgl) +
-NVME_RDMA_METADATA_SGL_SIZE;
-set->driver_data = ctrl;
-set->nr_hw_queues = nctrl->queue_count - 1;
-set->timeout = NVME_IO_TIMEOUT;
-set->nr_maps = nctrl->opts->nr_poll_queues ? HCTX_MAX_TYPES : 2;
-ret = blk_mq_alloc_tag_set(set);
-if (!ret)
-ctrl->ctrl.tagset = set;
-return ret;
-}
-static void nvme_rdma_destroy_admin_queue(struct nvme_rdma_ctrl *ctrl,
-bool remove)
-{
-if (remove) {
-blk_mq_destroy_queue(ctrl->ctrl.admin_q);
-blk_mq_destroy_queue(ctrl->ctrl.fabrics_q);
-blk_mq_free_tag_set(ctrl->ctrl.admin_tagset);
-}
+static int nvme_rdma_alloc_tag_set(struct nvme_ctrl *ctrl)
+{
+unsigned int cmd_size = sizeof(struct nvme_rdma_request) +
+NVME_RDMA_DATA_SGL_SIZE;
+if (ctrl->max_integrity_segments)
+cmd_size += sizeof(struct nvme_rdma_sgl) +
+NVME_RDMA_METADATA_SGL_SIZE;
+return nvme_alloc_io_tag_set(ctrl, &to_rdma_ctrl(ctrl)->tag_set,
+&nvme_rdma_mq_ops, BLK_MQ_F_SHOULD_MERGE, cmd_size);
+}
+static void nvme_rdma_destroy_admin_queue(struct nvme_rdma_ctrl *ctrl)
+{
if (ctrl->async_event_sqe.data) {
cancel_work_sync(&ctrl->ctrl.async_event_work);
nvme_rdma_free_qe(ctrl->device->dev, &ctrl->async_event_sqe,
@@ -886,26 +844,19 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl,
goto out_free_queue;
if (new) {
-error = nvme_rdma_alloc_admin_tag_set(&ctrl->ctrl);
+error = nvme_alloc_admin_tag_set(&ctrl->ctrl,
+&ctrl->admin_tag_set, &nvme_rdma_admin_mq_ops,
+BLK_MQ_F_NO_SCHED,
+sizeof(struct nvme_rdma_request) +
+NVME_RDMA_DATA_SGL_SIZE);
if (error)
goto out_free_async_qe;
-ctrl->ctrl.fabrics_q = blk_mq_init_queue(&ctrl->admin_tag_set);
-if (IS_ERR(ctrl->ctrl.fabrics_q)) {
-error = PTR_ERR(ctrl->ctrl.fabrics_q);
-goto out_free_tagset;
-}
-ctrl->ctrl.admin_q = blk_mq_init_queue(&ctrl->admin_tag_set);
-if (IS_ERR(ctrl->ctrl.admin_q)) {
-error = PTR_ERR(ctrl->ctrl.admin_q);
-goto out_cleanup_fabrics_q;
-}
}
error = nvme_rdma_start_queue(ctrl, 0);
if (error)
-goto out_cleanup_queue;
+goto out_remove_admin_tag_set;
error = nvme_enable_ctrl(&ctrl->ctrl);
if (error)
@@ -932,15 +883,9 @@ out_quiesce_queue:
out_stop_queue:
nvme_rdma_stop_queue(&ctrl->queues[0]);
nvme_cancel_admin_tagset(&ctrl->ctrl);
-out_cleanup_queue:
+out_remove_admin_tag_set:
if (new)
-blk_mq_destroy_queue(ctrl->ctrl.admin_q);
-out_cleanup_fabrics_q:
-if (new)
-blk_mq_destroy_queue(ctrl->ctrl.fabrics_q);
-out_free_tagset:
-if (new)
-blk_mq_free_tag_set(ctrl->ctrl.admin_tagset);
+nvme_remove_admin_tag_set(&ctrl->ctrl);
out_free_async_qe:
if (ctrl->async_event_sqe.data) {
nvme_rdma_free_qe(ctrl->device->dev, &ctrl->async_event_sqe,
@@ -952,19 +897,9 @@ out_free_queue:
return error;
}
static void nvme_rdma_destroy_io_queues(struct nvme_rdma_ctrl *ctrl,
bool remove)
{
if (remove) {
blk_mq_destroy_queue(ctrl->ctrl.connect_q);
blk_mq_free_tag_set(ctrl->ctrl.tagset);
}
nvme_rdma_free_io_queues(ctrl);
}
static int nvme_rdma_configure_io_queues(struct nvme_rdma_ctrl *ctrl, bool new)
{
-int ret;
+int ret, nr_queues;
ret = nvme_rdma_alloc_io_queues(ctrl);
if (ret)
@@ -974,15 +909,17 @@ static int nvme_rdma_configure_io_queues(struct nvme_rdma_ctrl *ctrl, bool new)
ret = nvme_rdma_alloc_tag_set(&ctrl->ctrl);
if (ret)
goto out_free_io_queues;
-ret = nvme_ctrl_init_connect_q(&(ctrl->ctrl));
-if (ret)
-goto out_free_tag_set;
}
-ret = nvme_rdma_start_io_queues(ctrl);
+/*
+ * Only start IO queues for which we have allocated the tagset
+ * and limitted it to the available queues. On reconnects, the
+ * queue number might have changed.
+ */
+nr_queues = min(ctrl->tag_set.nr_hw_queues + 1, ctrl->ctrl.queue_count);
+ret = nvme_rdma_start_io_queues(ctrl, 1, nr_queues);
if (ret)
-goto out_cleanup_connect_q;
+goto out_cleanup_tagset;
if (!new) {
nvme_start_queues(&ctrl->ctrl);
@@ -1000,19 +937,25 @@ static int nvme_rdma_configure_io_queues(struct nvme_rdma_ctrl *ctrl, bool new)
nvme_unfreeze(&ctrl->ctrl);
}
/*
* If the number of queues has increased (reconnect case)
* start all new queues now.
*/
ret = nvme_rdma_start_io_queues(ctrl, nr_queues,
ctrl->tag_set.nr_hw_queues + 1);
if (ret)
goto out_wait_freeze_timed_out;
return 0; return 0;
out_wait_freeze_timed_out:
nvme_stop_queues(&ctrl->ctrl);
nvme_sync_io_queues(&ctrl->ctrl);
nvme_rdma_stop_io_queues(ctrl);
-out_cleanup_connect_q:
+out_cleanup_tagset:
nvme_cancel_tagset(&ctrl->ctrl);
if (new)
-blk_mq_destroy_queue(ctrl->ctrl.connect_q);
-out_free_tag_set:
-if (new)
-blk_mq_free_tag_set(ctrl->ctrl.tagset);
+nvme_remove_io_tag_set(&ctrl->ctrl);
out_free_io_queues:
nvme_rdma_free_io_queues(ctrl);
return ret;
@@ -1025,9 +968,11 @@ static void nvme_rdma_teardown_admin_queue(struct nvme_rdma_ctrl *ctrl,
blk_sync_queue(ctrl->ctrl.admin_q);
nvme_rdma_stop_queue(&ctrl->queues[0]);
nvme_cancel_admin_tagset(&ctrl->ctrl);
-if (remove)
+if (remove) {
nvme_start_admin_queue(&ctrl->ctrl);
-nvme_rdma_destroy_admin_queue(ctrl, remove);
+nvme_remove_admin_tag_set(&ctrl->ctrl);
+}
+nvme_rdma_destroy_admin_queue(ctrl);
}
static void nvme_rdma_teardown_io_queues(struct nvme_rdma_ctrl *ctrl,
@@ -1039,9 +984,11 @@ static void nvme_rdma_teardown_io_queues(struct nvme_rdma_ctrl *ctrl,
nvme_sync_io_queues(&ctrl->ctrl);
nvme_rdma_stop_io_queues(ctrl);
nvme_cancel_tagset(&ctrl->ctrl);
-if (remove)
+if (remove) {
nvme_start_queues(&ctrl->ctrl);
-nvme_rdma_destroy_io_queues(ctrl, remove);
+nvme_remove_io_tag_set(&ctrl->ctrl);
+}
+nvme_rdma_free_io_queues(ctrl);
}
}
@@ -1163,14 +1110,18 @@ destroy_io:
nvme_sync_io_queues(&ctrl->ctrl);
nvme_rdma_stop_io_queues(ctrl);
nvme_cancel_tagset(&ctrl->ctrl);
-nvme_rdma_destroy_io_queues(ctrl, new);
+if (new)
+nvme_remove_io_tag_set(&ctrl->ctrl);
+nvme_rdma_free_io_queues(ctrl);
}
destroy_admin:
nvme_stop_admin_queue(&ctrl->ctrl);
blk_sync_queue(ctrl->ctrl.admin_q);
nvme_rdma_stop_queue(&ctrl->queues[0]);
nvme_cancel_admin_tagset(&ctrl->ctrl);
-nvme_rdma_destroy_admin_queue(ctrl, new);
+if (new)
+nvme_remove_admin_tag_set(&ctrl->ctrl);
+nvme_rdma_destroy_admin_queue(ctrl);
return ret;
}
@@ -2188,9 +2139,9 @@ static void nvme_rdma_complete_rq(struct request *rq)
nvme_complete_rq(rq);
}
-static int nvme_rdma_map_queues(struct blk_mq_tag_set *set)
+static void nvme_rdma_map_queues(struct blk_mq_tag_set *set)
{
-struct nvme_rdma_ctrl *ctrl = set->driver_data;
+struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(set->driver_data);
struct nvmf_ctrl_options *opts = ctrl->ctrl.opts;
if (opts->nr_write_queues && ctrl->io_queues[HCTX_TYPE_READ]) {
@@ -2231,8 +2182,6 @@ static int nvme_rdma_map_queues(struct blk_mq_tag_set *set)
ctrl->io_queues[HCTX_TYPE_DEFAULT],
ctrl->io_queues[HCTX_TYPE_READ],
ctrl->io_queues[HCTX_TYPE_POLL]);
-return 0;
}
static const struct blk_mq_ops nvme_rdma_mq_ops = {


@@ -133,7 +133,6 @@ struct nvme_tcp_queue {
/* send state */ /* send state */
struct nvme_tcp_request *request; struct nvme_tcp_request *request;
int queue_size;
u32 maxh2cdata; u32 maxh2cdata;
size_t cmnd_capsule_len; size_t cmnd_capsule_len;
struct nvme_tcp_ctrl *ctrl; struct nvme_tcp_ctrl *ctrl;
@@ -463,7 +462,7 @@ static int nvme_tcp_init_request(struct blk_mq_tag_set *set,
struct request *rq, unsigned int hctx_idx, struct request *rq, unsigned int hctx_idx,
unsigned int numa_node) unsigned int numa_node)
{ {
struct nvme_tcp_ctrl *ctrl = set->driver_data; struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(set->driver_data);
struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq); struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
struct nvme_tcp_cmd_pdu *pdu; struct nvme_tcp_cmd_pdu *pdu;
int queue_idx = (set == &ctrl->tag_set) ? hctx_idx + 1 : 0; int queue_idx = (set == &ctrl->tag_set) ? hctx_idx + 1 : 0;
@@ -487,7 +486,7 @@ static int nvme_tcp_init_request(struct blk_mq_tag_set *set,
static int nvme_tcp_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, static int nvme_tcp_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
unsigned int hctx_idx) unsigned int hctx_idx)
{ {
struct nvme_tcp_ctrl *ctrl = data; struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(data);
struct nvme_tcp_queue *queue = &ctrl->queues[hctx_idx + 1]; struct nvme_tcp_queue *queue = &ctrl->queues[hctx_idx + 1];
hctx->driver_data = queue; hctx->driver_data = queue;
@@ -497,7 +496,7 @@ static int nvme_tcp_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
static int nvme_tcp_init_admin_hctx(struct blk_mq_hw_ctx *hctx, void *data, static int nvme_tcp_init_admin_hctx(struct blk_mq_hw_ctx *hctx, void *data,
unsigned int hctx_idx) unsigned int hctx_idx)
{ {
struct nvme_tcp_ctrl *ctrl = data; struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(data);
struct nvme_tcp_queue *queue = &ctrl->queues[0]; struct nvme_tcp_queue *queue = &ctrl->queues[0];
hctx->driver_data = queue; hctx->driver_data = queue;
@@ -1476,8 +1475,7 @@ static void nvme_tcp_set_queue_io_cpu(struct nvme_tcp_queue *queue)
queue->io_cpu = cpumask_next_wrap(n - 1, cpu_online_mask, -1, false); queue->io_cpu = cpumask_next_wrap(n - 1, cpu_online_mask, -1, false);
} }
static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, int qid)
int qid, size_t queue_size)
{ {
struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl); struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
struct nvme_tcp_queue *queue = &ctrl->queues[qid]; struct nvme_tcp_queue *queue = &ctrl->queues[qid];
@@ -1489,7 +1487,6 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl,
INIT_LIST_HEAD(&queue->send_list); INIT_LIST_HEAD(&queue->send_list);
mutex_init(&queue->send_mutex); mutex_init(&queue->send_mutex);
INIT_WORK(&queue->io_work, nvme_tcp_io_work); INIT_WORK(&queue->io_work, nvme_tcp_io_work);
queue->queue_size = queue_size;
if (qid > 0) if (qid > 0)
queue->cmnd_capsule_len = nctrl->ioccsz * 16; queue->cmnd_capsule_len = nctrl->ioccsz * 16;
@@ -1687,51 +1684,6 @@ static int nvme_tcp_start_queue(struct nvme_ctrl *nctrl, int idx)
return ret; return ret;
} }
static int nvme_tcp_alloc_admin_tag_set(struct nvme_ctrl *nctrl)
{
struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
struct blk_mq_tag_set *set = &ctrl->admin_tag_set;
int ret;
memset(set, 0, sizeof(*set));
set->ops = &nvme_tcp_admin_mq_ops;
set->queue_depth = NVME_AQ_MQ_TAG_DEPTH;
set->reserved_tags = NVMF_RESERVED_TAGS;
set->numa_node = nctrl->numa_node;
set->flags = BLK_MQ_F_BLOCKING;
set->cmd_size = sizeof(struct nvme_tcp_request);
set->driver_data = ctrl;
set->nr_hw_queues = 1;
set->timeout = NVME_ADMIN_TIMEOUT;
ret = blk_mq_alloc_tag_set(set);
if (!ret)
nctrl->admin_tagset = set;
return ret;
}
static int nvme_tcp_alloc_tag_set(struct nvme_ctrl *nctrl)
{
struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
struct blk_mq_tag_set *set = &ctrl->tag_set;
int ret;
memset(set, 0, sizeof(*set));
set->ops = &nvme_tcp_mq_ops;
set->queue_depth = nctrl->sqsize + 1;
set->reserved_tags = NVMF_RESERVED_TAGS;
set->numa_node = nctrl->numa_node;
set->flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_BLOCKING;
set->cmd_size = sizeof(struct nvme_tcp_request);
set->driver_data = ctrl;
set->nr_hw_queues = nctrl->queue_count - 1;
set->timeout = NVME_IO_TIMEOUT;
set->nr_maps = nctrl->opts->nr_poll_queues ? HCTX_MAX_TYPES : 2;
ret = blk_mq_alloc_tag_set(set);
if (!ret)
nctrl->tagset = set;
return ret;
}
static void nvme_tcp_free_admin_queue(struct nvme_ctrl *ctrl) static void nvme_tcp_free_admin_queue(struct nvme_ctrl *ctrl)
{ {
if (to_tcp_ctrl(ctrl)->async_req.pdu) { if (to_tcp_ctrl(ctrl)->async_req.pdu) {
@@ -1759,11 +1711,12 @@ static void nvme_tcp_stop_io_queues(struct nvme_ctrl *ctrl)
nvme_tcp_stop_queue(ctrl, i); nvme_tcp_stop_queue(ctrl, i);
} }
static int nvme_tcp_start_io_queues(struct nvme_ctrl *ctrl) static int nvme_tcp_start_io_queues(struct nvme_ctrl *ctrl,
int first, int last)
{ {
int i, ret; int i, ret;
for (i = 1; i < ctrl->queue_count; i++) { for (i = first; i < last; i++) {
ret = nvme_tcp_start_queue(ctrl, i); ret = nvme_tcp_start_queue(ctrl, i);
if (ret) if (ret)
goto out_stop_queues; goto out_stop_queues;
@@ -1772,7 +1725,7 @@ static int nvme_tcp_start_io_queues(struct nvme_ctrl *ctrl)
return 0; return 0;
out_stop_queues: out_stop_queues:
for (i--; i >= 1; i--) for (i--; i >= first; i--)
nvme_tcp_stop_queue(ctrl, i); nvme_tcp_stop_queue(ctrl, i);
return ret; return ret;
} }
@@ -1781,7 +1734,7 @@ static int nvme_tcp_alloc_admin_queue(struct nvme_ctrl *ctrl)
{ {
int ret; int ret;
ret = nvme_tcp_alloc_queue(ctrl, 0, NVME_AQ_DEPTH); ret = nvme_tcp_alloc_queue(ctrl, 0);
if (ret) if (ret)
return ret; return ret;
@@ -1801,7 +1754,7 @@ static int __nvme_tcp_alloc_io_queues(struct nvme_ctrl *ctrl)
int i, ret; int i, ret;
for (i = 1; i < ctrl->queue_count; i++) { for (i = 1; i < ctrl->queue_count; i++) {
ret = nvme_tcp_alloc_queue(ctrl, i, ctrl->sqsize + 1); ret = nvme_tcp_alloc_queue(ctrl, i);
if (ret) if (ret)
goto out_free_queues; goto out_free_queues;
} }
@@ -1889,32 +1842,35 @@ static int nvme_tcp_alloc_io_queues(struct nvme_ctrl *ctrl)
static void nvme_tcp_destroy_io_queues(struct nvme_ctrl *ctrl, bool remove) static void nvme_tcp_destroy_io_queues(struct nvme_ctrl *ctrl, bool remove)
{ {
nvme_tcp_stop_io_queues(ctrl); nvme_tcp_stop_io_queues(ctrl);
if (remove) { if (remove)
blk_mq_destroy_queue(ctrl->connect_q); nvme_remove_io_tag_set(ctrl);
blk_mq_free_tag_set(ctrl->tagset);
}
nvme_tcp_free_io_queues(ctrl); nvme_tcp_free_io_queues(ctrl);
} }
static int nvme_tcp_configure_io_queues(struct nvme_ctrl *ctrl, bool new) static int nvme_tcp_configure_io_queues(struct nvme_ctrl *ctrl, bool new)
{ {
int ret; int ret, nr_queues;
ret = nvme_tcp_alloc_io_queues(ctrl); ret = nvme_tcp_alloc_io_queues(ctrl);
if (ret) if (ret)
return ret; return ret;
if (new) { if (new) {
ret = nvme_tcp_alloc_tag_set(ctrl); ret = nvme_alloc_io_tag_set(ctrl, &to_tcp_ctrl(ctrl)->tag_set,
&nvme_tcp_mq_ops,
BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_BLOCKING,
sizeof(struct nvme_tcp_request));
if (ret) if (ret)
goto out_free_io_queues; goto out_free_io_queues;
ret = nvme_ctrl_init_connect_q(ctrl);
if (ret)
goto out_free_tag_set;
} }
ret = nvme_tcp_start_io_queues(ctrl); /*
* Only start IO queues for which we have allocated the tagset
* and limited it to the available queues. On reconnects, the
* queue number might have changed.
*/
nr_queues = min(ctrl->tagset->nr_hw_queues + 1, ctrl->queue_count);
ret = nvme_tcp_start_io_queues(ctrl, 1, nr_queues);
if (ret) if (ret)
goto out_cleanup_connect_q; goto out_cleanup_connect_q;
@@ -1934,6 +1890,15 @@ static int nvme_tcp_configure_io_queues(struct nvme_ctrl *ctrl, bool new)
nvme_unfreeze(ctrl); nvme_unfreeze(ctrl);
} }
/*
* If the number of queues has increased (reconnect case)
* start all new queues now.
*/
ret = nvme_tcp_start_io_queues(ctrl, nr_queues,
ctrl->tagset->nr_hw_queues + 1);
if (ret)
goto out_wait_freeze_timed_out;
return 0; return 0;
out_wait_freeze_timed_out: out_wait_freeze_timed_out:
@@ -1943,10 +1908,7 @@ out_wait_freeze_timed_out:
out_cleanup_connect_q: out_cleanup_connect_q:
nvme_cancel_tagset(ctrl); nvme_cancel_tagset(ctrl);
if (new) if (new)
blk_mq_destroy_queue(ctrl->connect_q); nvme_remove_io_tag_set(ctrl);
out_free_tag_set:
if (new)
blk_mq_free_tag_set(ctrl->tagset);
out_free_io_queues: out_free_io_queues:
nvme_tcp_free_io_queues(ctrl); nvme_tcp_free_io_queues(ctrl);
return ret; return ret;
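
The two-phase queue start above can be illustrated outside the kernel. The following is a minimal userspace C sketch, not kernel code: it only mirrors the arithmetic (clamp to the queues the existing tagset already covers, start the remainder once the tagset has been resized). The queue counts and the point where the tagset grows are illustrative assumptions.

/*
 * Standalone userspace sketch (not kernel code) of the reconnect-safe queue
 * start: queue counts and the tagset resize step are illustrative assumptions.
 */
#include <stdio.h>

static unsigned int min_u(unsigned int a, unsigned int b)
{
        return a < b ? a : b;
}

/* pretend to start I/O queues in the half-open range [first, last) */
static void start_io_queues(unsigned int first, unsigned int last)
{
        for (unsigned int i = first; i < last; i++)
                printf("  starting I/O queue %u\n", i);
}

int main(void)
{
        unsigned int tagset_hw_queues = 4;   /* queues the tagset knows about */
        unsigned int queue_count = 7;        /* 1 admin + 6 I/O queues after reconnect */
        unsigned int nr_queues;

        /* phase 1: only queues the current tagset can address */
        nr_queues = min_u(tagset_hw_queues + 1, queue_count);
        printf("phase 1 (before tagset resize):\n");
        start_io_queues(1, nr_queues);

        /* the driver would resize the tagset here on reconnect */
        tagset_hw_queues = queue_count - 1;

        /* phase 2: queues gained on reconnect */
        printf("phase 2 (after tagset resize):\n");
        start_io_queues(nr_queues, tagset_hw_queues + 1);
        return 0;
}

The same first/last split is what nvme_tcp_start_io_queues() now takes as parameters.
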
@@ -1955,11 +1917,8 @@ out_free_io_queues:
static void nvme_tcp_destroy_admin_queue(struct nvme_ctrl *ctrl, bool remove) static void nvme_tcp_destroy_admin_queue(struct nvme_ctrl *ctrl, bool remove)
{ {
nvme_tcp_stop_queue(ctrl, 0); nvme_tcp_stop_queue(ctrl, 0);
if (remove) { if (remove)
blk_mq_destroy_queue(ctrl->admin_q); nvme_remove_admin_tag_set(ctrl);
blk_mq_destroy_queue(ctrl->fabrics_q);
blk_mq_free_tag_set(ctrl->admin_tagset);
}
nvme_tcp_free_admin_queue(ctrl); nvme_tcp_free_admin_queue(ctrl);
} }
@@ -1972,26 +1931,17 @@ static int nvme_tcp_configure_admin_queue(struct nvme_ctrl *ctrl, bool new)
return error; return error;
if (new) { if (new) {
error = nvme_tcp_alloc_admin_tag_set(ctrl); error = nvme_alloc_admin_tag_set(ctrl,
&to_tcp_ctrl(ctrl)->admin_tag_set,
&nvme_tcp_admin_mq_ops, BLK_MQ_F_BLOCKING,
sizeof(struct nvme_tcp_request));
if (error) if (error)
goto out_free_queue; goto out_free_queue;
ctrl->fabrics_q = blk_mq_init_queue(ctrl->admin_tagset);
if (IS_ERR(ctrl->fabrics_q)) {
error = PTR_ERR(ctrl->fabrics_q);
goto out_free_tagset;
}
ctrl->admin_q = blk_mq_init_queue(ctrl->admin_tagset);
if (IS_ERR(ctrl->admin_q)) {
error = PTR_ERR(ctrl->admin_q);
goto out_cleanup_fabrics_q;
}
} }
error = nvme_tcp_start_queue(ctrl, 0); error = nvme_tcp_start_queue(ctrl, 0);
if (error) if (error)
goto out_cleanup_queue; goto out_cleanup_tagset;
error = nvme_enable_ctrl(ctrl); error = nvme_enable_ctrl(ctrl);
if (error) if (error)
@@ -2011,15 +1961,9 @@ out_quiesce_queue:
out_stop_queue: out_stop_queue:
nvme_tcp_stop_queue(ctrl, 0); nvme_tcp_stop_queue(ctrl, 0);
nvme_cancel_admin_tagset(ctrl); nvme_cancel_admin_tagset(ctrl);
out_cleanup_queue: out_cleanup_tagset:
if (new) if (new)
blk_mq_destroy_queue(ctrl->admin_q); nvme_remove_admin_tag_set(ctrl);
out_cleanup_fabrics_q:
if (new)
blk_mq_destroy_queue(ctrl->fabrics_q);
out_free_tagset:
if (new)
blk_mq_free_tag_set(ctrl->admin_tagset);
out_free_queue: out_free_queue:
nvme_tcp_free_admin_queue(ctrl); nvme_tcp_free_admin_queue(ctrl);
return error; return error;
@@ -2468,9 +2412,9 @@ static blk_status_t nvme_tcp_queue_rq(struct blk_mq_hw_ctx *hctx,
return BLK_STS_OK; return BLK_STS_OK;
} }
static int nvme_tcp_map_queues(struct blk_mq_tag_set *set) static void nvme_tcp_map_queues(struct blk_mq_tag_set *set)
{ {
struct nvme_tcp_ctrl *ctrl = set->driver_data; struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(set->driver_data);
struct nvmf_ctrl_options *opts = ctrl->ctrl.opts; struct nvmf_ctrl_options *opts = ctrl->ctrl.opts;
if (opts->nr_write_queues && ctrl->io_queues[HCTX_TYPE_READ]) { if (opts->nr_write_queues && ctrl->io_queues[HCTX_TYPE_READ]) {
@@ -2509,8 +2453,6 @@ static int nvme_tcp_map_queues(struct blk_mq_tag_set *set)
ctrl->io_queues[HCTX_TYPE_DEFAULT], ctrl->io_queues[HCTX_TYPE_DEFAULT],
ctrl->io_queues[HCTX_TYPE_READ], ctrl->io_queues[HCTX_TYPE_READ],
ctrl->io_queues[HCTX_TYPE_POLL]); ctrl->io_queues[HCTX_TYPE_POLL]);
return 0;
} }
static int nvme_tcp_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob) static int nvme_tcp_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob)
@@ -2529,6 +2471,25 @@ static int nvme_tcp_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob)
return queue->nr_cqe; return queue->nr_cqe;
} }
static int nvme_tcp_get_address(struct nvme_ctrl *ctrl, char *buf, int size)
{
struct nvme_tcp_queue *queue = &to_tcp_ctrl(ctrl)->queues[0];
struct sockaddr_storage src_addr;
int ret, len;
len = nvmf_get_address(ctrl, buf, size);
ret = kernel_getsockname(queue->sock, (struct sockaddr *)&src_addr);
if (ret > 0) {
if (len > 0)
len--; /* strip trailing newline */
len += scnprintf(buf + len, size - len, "%ssrc_addr=%pISc\n",
(len) ? "," : "", &src_addr);
}
return len;
}
static const struct blk_mq_ops nvme_tcp_mq_ops = { static const struct blk_mq_ops nvme_tcp_mq_ops = {
.queue_rq = nvme_tcp_queue_rq, .queue_rq = nvme_tcp_queue_rq,
.commit_rqs = nvme_tcp_commit_rqs, .commit_rqs = nvme_tcp_commit_rqs,
@@ -2560,7 +2521,7 @@ static const struct nvme_ctrl_ops nvme_tcp_ctrl_ops = {
.free_ctrl = nvme_tcp_free_ctrl, .free_ctrl = nvme_tcp_free_ctrl,
.submit_async_event = nvme_tcp_submit_async_event, .submit_async_event = nvme_tcp_submit_async_event,
.delete_ctrl = nvme_tcp_delete_ctrl, .delete_ctrl = nvme_tcp_delete_ctrl,
.get_address = nvmf_get_address, .get_address = nvme_tcp_get_address,
.stop_ctrl = nvme_tcp_stop_ctrl, .stop_ctrl = nvme_tcp_stop_ctrl,
}; };
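
The buffer handling in nvme_tcp_get_address() above amounts to plain string surgery. Below is a minimal userspace C sketch, assuming a placeholder base string and source address; the kernel uses nvmf_get_address() and %pISc formatting instead.

/*
 * Userspace sketch, not kernel code: strip the trailing newline from the
 * base "address" string and append a comma-separated src_addr field.  The
 * base string and source IP are placeholders.
 */
#include <stdio.h>

int main(void)
{
        char buf[256];
        const char *src_addr = "192.168.0.10";     /* placeholder source IP */
        int len;

        /* stand-in for what nvmf_get_address() writes, newline-terminated */
        len = snprintf(buf, sizeof(buf), "traddr=192.168.0.20,trsvcid=4420\n");

        if (len > 0)
                len--;                             /* strip trailing newline */
        len += snprintf(buf + len, sizeof(buf) - len, "%ssrc_addr=%s\n",
                        len ? "," : "", src_addr);

        fputs(buf, stdout);
        return 0;
}
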


@@ -449,7 +449,7 @@ static void nvmet_execute_identify_ctrl(struct nvmet_req *req)
if (req->port->inline_data_size) if (req->port->inline_data_size)
id->sgls |= cpu_to_le32(1 << 20); id->sgls |= cpu_to_le32(1 << 20);
strlcpy(id->subnqn, ctrl->subsys->subsysnqn, sizeof(id->subnqn)); strscpy(id->subnqn, ctrl->subsys->subsysnqn, sizeof(id->subnqn));
/* /*
* Max command capsule size is sqe + in-capsule data size. * Max command capsule size is sqe + in-capsule data size.


@@ -1281,6 +1281,34 @@ static ssize_t nvmet_subsys_attr_pi_enable_store(struct config_item *item,
CONFIGFS_ATTR(nvmet_subsys_, attr_pi_enable); CONFIGFS_ATTR(nvmet_subsys_, attr_pi_enable);
#endif #endif
static ssize_t nvmet_subsys_attr_qid_max_show(struct config_item *item,
char *page)
{
return snprintf(page, PAGE_SIZE, "%u\n", to_subsys(item)->max_qid);
}
static ssize_t nvmet_subsys_attr_qid_max_store(struct config_item *item,
const char *page, size_t cnt)
{
struct nvmet_port *port = to_nvmet_port(item);
u16 qid_max;
if (nvmet_is_port_enabled(port, __func__))
return -EACCES;
if (sscanf(page, "%hu\n", &qid_max) != 1)
return -EINVAL;
if (qid_max < 1 || qid_max > NVMET_NR_QUEUES)
return -EINVAL;
down_write(&nvmet_config_sem);
to_subsys(item)->max_qid = qid_max;
up_write(&nvmet_config_sem);
return cnt;
}
CONFIGFS_ATTR(nvmet_subsys_, attr_qid_max);
static struct configfs_attribute *nvmet_subsys_attrs[] = { static struct configfs_attribute *nvmet_subsys_attrs[] = {
&nvmet_subsys_attr_attr_allow_any_host, &nvmet_subsys_attr_attr_allow_any_host,
&nvmet_subsys_attr_attr_version, &nvmet_subsys_attr_attr_version,
@@ -1288,6 +1316,7 @@ static struct configfs_attribute *nvmet_subsys_attrs[] = {
&nvmet_subsys_attr_attr_cntlid_min, &nvmet_subsys_attr_attr_cntlid_min,
&nvmet_subsys_attr_attr_cntlid_max, &nvmet_subsys_attr_attr_cntlid_max,
&nvmet_subsys_attr_attr_model, &nvmet_subsys_attr_attr_model,
&nvmet_subsys_attr_attr_qid_max,
#ifdef CONFIG_BLK_DEV_INTEGRITY #ifdef CONFIG_BLK_DEV_INTEGRITY
&nvmet_subsys_attr_attr_pi_enable, &nvmet_subsys_attr_attr_pi_enable,
#endif #endif
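
The new qid_max store handler boils down to a bounded parse. A minimal userspace C sketch of that validation follows; the NVMET_NR_QUEUES value is assumed here for illustration.

/*
 * Userspace sketch, not kernel code: parse an unsigned short and reject
 * anything outside 1..NVMET_NR_QUEUES.  The limit value is assumed.
 */
#include <stdio.h>

#define NVMET_NR_QUEUES 128     /* assumed value of the nvmet limit */

static int parse_qid_max(const char *page, unsigned short *qid_max)
{
        if (sscanf(page, "%hu", qid_max) != 1)
                return -1;                      /* not a number */
        if (*qid_max < 1 || *qid_max > NVMET_NR_QUEUES)
                return -1;                      /* out of range */
        return 0;
}

int main(void)
{
        const char *inputs[] = { "64", "0", "4096", "abc" };
        unsigned short qid_max;

        for (unsigned int i = 0; i < 4; i++)
                printf("%-5s -> %s\n", inputs[i],
                       parse_qid_max(inputs[i], &qid_max) ? "rejected" : "accepted");
        return 0;
}
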


@@ -832,6 +832,7 @@ int nvmet_sq_init(struct nvmet_sq *sq)
} }
init_completion(&sq->free_done); init_completion(&sq->free_done);
init_completion(&sq->confirm_done); init_completion(&sq->confirm_done);
nvmet_auth_sq_init(sq);
return 0; return 0;
} }


@@ -292,7 +292,7 @@ static void nvmet_execute_disc_identify(struct nvmet_req *req)
id->oaes = cpu_to_le32(NVMET_DISC_AEN_CFG_OPTIONAL); id->oaes = cpu_to_le32(NVMET_DISC_AEN_CFG_OPTIONAL);
strlcpy(id->subnqn, ctrl->subsys->subsysnqn, sizeof(id->subnqn)); strscpy(id->subnqn, ctrl->subsys->subsysnqn, sizeof(id->subnqn));
status = nvmet_copy_to_sgl(req, 0, id, sizeof(*id)); status = nvmet_copy_to_sgl(req, 0, id, sizeof(*id));


@@ -23,17 +23,12 @@ static void nvmet_auth_expired_work(struct work_struct *work)
sq->dhchap_tid = -1; sq->dhchap_tid = -1;
} }
void nvmet_init_auth(struct nvmet_ctrl *ctrl, struct nvmet_req *req) void nvmet_auth_sq_init(struct nvmet_sq *sq)
{ {
u32 result = le32_to_cpu(req->cqe->result.u32);
/* Initialize in-band authentication */ /* Initialize in-band authentication */
INIT_DELAYED_WORK(&req->sq->auth_expired_work, INIT_DELAYED_WORK(&sq->auth_expired_work, nvmet_auth_expired_work);
nvmet_auth_expired_work); sq->authenticated = false;
req->sq->authenticated = false; sq->dhchap_step = NVME_AUTH_DHCHAP_MESSAGE_NEGOTIATE;
req->sq->dhchap_step = NVME_AUTH_DHCHAP_MESSAGE_NEGOTIATE;
result |= (u32)NVME_CONNECT_AUTHREQ_ATR << 16;
req->cqe->result.u32 = cpu_to_le32(result);
} }
static u16 nvmet_auth_negotiate(struct nvmet_req *req, void *d) static u16 nvmet_auth_negotiate(struct nvmet_req *req, void *d)
@@ -177,7 +172,7 @@ static u16 nvmet_auth_reply(struct nvmet_req *req, void *d)
return 0; return 0;
} }
static u16 nvmet_auth_failure2(struct nvmet_req *req, void *d) static u16 nvmet_auth_failure2(void *d)
{ {
struct nvmf_auth_dhchap_failure_data *data = d; struct nvmf_auth_dhchap_failure_data *data = d;
@@ -229,10 +224,8 @@ void nvmet_execute_auth_send(struct nvmet_req *req)
} }
status = nvmet_copy_from_sgl(req, 0, d, tl); status = nvmet_copy_from_sgl(req, 0, d, tl);
if (status) { if (status)
kfree(d); goto done_kfree;
goto done;
}
data = d; data = d;
pr_debug("%s: ctrl %d qid %d type %d id %d step %x\n", __func__, pr_debug("%s: ctrl %d qid %d type %d id %d step %x\n", __func__,
@@ -310,7 +303,7 @@ void nvmet_execute_auth_send(struct nvmet_req *req)
goto done_kfree; goto done_kfree;
break; break;
case NVME_AUTH_DHCHAP_MESSAGE_FAILURE2: case NVME_AUTH_DHCHAP_MESSAGE_FAILURE2:
status = nvmet_auth_failure2(req, d); status = nvmet_auth_failure2(d);
if (status) { if (status) {
pr_warn("ctrl %d qid %d: authentication failed (%d)\n", pr_warn("ctrl %d qid %d: authentication failed (%d)\n",
ctrl->cntlid, req->sq->qid, status); ctrl->cntlid, req->sq->qid, status);


@@ -198,6 +198,12 @@ err:
return ret; return ret;
} }
static u32 nvmet_connect_result(struct nvmet_ctrl *ctrl)
{
return (u32)ctrl->cntlid |
(nvmet_has_auth(ctrl) ? NVME_CONNECT_AUTHREQ_ATR : 0);
}
static void nvmet_execute_admin_connect(struct nvmet_req *req) static void nvmet_execute_admin_connect(struct nvmet_req *req)
{ {
struct nvmf_connect_command *c = &req->cmd->connect; struct nvmf_connect_command *c = &req->cmd->connect;
@@ -269,10 +275,7 @@ static void nvmet_execute_admin_connect(struct nvmet_req *req)
ctrl->cntlid, ctrl->subsys->subsysnqn, ctrl->hostnqn, ctrl->cntlid, ctrl->subsys->subsysnqn, ctrl->hostnqn,
ctrl->pi_support ? " T10-PI is enabled" : "", ctrl->pi_support ? " T10-PI is enabled" : "",
nvmet_has_auth(ctrl) ? " with DH-HMAC-CHAP" : ""); nvmet_has_auth(ctrl) ? " with DH-HMAC-CHAP" : "");
req->cqe->result.u16 = cpu_to_le16(ctrl->cntlid); req->cqe->result.u32 = cpu_to_le32(nvmet_connect_result(ctrl));
if (nvmet_has_auth(ctrl))
nvmet_init_auth(ctrl, req);
out: out:
kfree(d); kfree(d);
complete: complete:
@@ -328,14 +331,8 @@ static void nvmet_execute_io_connect(struct nvmet_req *req)
if (status) if (status)
goto out_ctrl_put; goto out_ctrl_put;
/* pass back cntlid for successful completion */
req->cqe->result.u16 = cpu_to_le16(ctrl->cntlid);
pr_debug("adding queue %d to ctrl %d.\n", qid, ctrl->cntlid); pr_debug("adding queue %d to ctrl %d.\n", qid, ctrl->cntlid);
req->cqe->result.u16 = cpu_to_le16(ctrl->cntlid); req->cqe->result.u32 = cpu_to_le32(nvmet_connect_result(ctrl));
if (nvmet_has_auth(ctrl))
nvmet_init_auth(ctrl, req);
out: out:
kfree(d); kfree(d);
complete: complete:
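
The nvmet_connect_result() helper packs the controller ID and the authentication-required flag into one CQE dword. A minimal userspace C sketch follows; the bit position used is an assumption for illustration rather than the actual value of NVME_CONNECT_AUTHREQ_ATR.

/*
 * Userspace sketch, not kernel code: controller ID in the low 16 bits of
 * CQE dword 0, authentication-required flag above it.  Bit position assumed.
 */
#include <stdint.h>
#include <stdio.h>

#define AUTHREQ_ATR (1u << 17)          /* assumed "authentication required" bit */

static uint32_t connect_result(uint16_t cntlid, int has_auth)
{
        return (uint32_t)cntlid | (has_auth ? AUTHREQ_ATR : 0);
}

int main(void)
{
        printf("no auth:   0x%08x\n", (unsigned int)connect_result(0x0042, 0));
        printf("with auth: 0x%08x\n", (unsigned int)connect_result(0x0042, 1));
        return 0;
}
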


@@ -12,11 +12,9 @@
void nvmet_bdev_set_limits(struct block_device *bdev, struct nvme_id_ns *id) void nvmet_bdev_set_limits(struct block_device *bdev, struct nvme_id_ns *id)
{ {
const struct queue_limits *ql = &bdev_get_queue(bdev)->limits;
/* Number of logical blocks per physical block. */
const u32 lpp = ql->physical_block_size / ql->logical_block_size;
/* Logical blocks per physical block, 0's based. */ /* Logical blocks per physical block, 0's based. */
const __le16 lpp0b = to0based(lpp); const __le16 lpp0b = to0based(bdev_physical_block_size(bdev) /
bdev_logical_block_size(bdev));
/* /*
* For NVMe 1.2 and later, bit 1 indicates that the fields NAWUN, * For NVMe 1.2 and later, bit 1 indicates that the fields NAWUN,
@@ -42,11 +40,12 @@ void nvmet_bdev_set_limits(struct block_device *bdev, struct nvme_id_ns *id)
/* NPWA = Namespace Preferred Write Alignment. 0's based */ /* NPWA = Namespace Preferred Write Alignment. 0's based */
id->npwa = id->npwg; id->npwa = id->npwg;
/* NPDG = Namespace Preferred Deallocate Granularity. 0's based */ /* NPDG = Namespace Preferred Deallocate Granularity. 0's based */
id->npdg = to0based(ql->discard_granularity / ql->logical_block_size); id->npdg = to0based(bdev_discard_granularity(bdev) /
bdev_logical_block_size(bdev));
/* NPDA = Namespace Preferred Deallocate Alignment */ /* NPDA = Namespace Preferred Deallocate Alignment */
id->npda = id->npdg; id->npda = id->npdg;
/* NOWS = Namespace Optimal Write Size */ /* NOWS = Namespace Optimal Write Size */
id->nows = to0based(ql->io_opt / ql->logical_block_size); id->nows = to0based(bdev_io_opt(bdev) / bdev_logical_block_size(bdev));
} }
void nvmet_bdev_ns_disable(struct nvmet_ns *ns) void nvmet_bdev_ns_disable(struct nvmet_ns *ns)
@@ -334,6 +333,11 @@ static void nvmet_bdev_execute_flush(struct nvmet_req *req)
{ {
struct bio *bio = &req->b.inline_bio; struct bio *bio = &req->b.inline_bio;
if (!bdev_write_cache(req->ns->bdev)) {
nvmet_req_complete(req, NVME_SC_SUCCESS);
return;
}
if (!nvmet_check_transfer_len(req, 0)) if (!nvmet_check_transfer_len(req, 0))
return; return;
@@ -347,6 +351,9 @@ static void nvmet_bdev_execute_flush(struct nvmet_req *req)
u16 nvmet_bdev_flush(struct nvmet_req *req) u16 nvmet_bdev_flush(struct nvmet_req *req)
{ {
if (!bdev_write_cache(req->ns->bdev))
return 0;
if (blkdev_issue_flush(req->ns->bdev)) if (blkdev_issue_flush(req->ns->bdev))
return NVME_SC_INTERNAL | NVME_SC_DNR; return NVME_SC_INTERNAL | NVME_SC_DNR;
return 0; return 0;


@@ -204,7 +204,7 @@ static int nvme_loop_init_request(struct blk_mq_tag_set *set,
struct request *req, unsigned int hctx_idx, struct request *req, unsigned int hctx_idx,
unsigned int numa_node) unsigned int numa_node)
{ {
struct nvme_loop_ctrl *ctrl = set->driver_data; struct nvme_loop_ctrl *ctrl = to_loop_ctrl(set->driver_data);
struct nvme_loop_iod *iod = blk_mq_rq_to_pdu(req); struct nvme_loop_iod *iod = blk_mq_rq_to_pdu(req);
nvme_req(req)->ctrl = &ctrl->ctrl; nvme_req(req)->ctrl = &ctrl->ctrl;
@@ -218,7 +218,7 @@ static struct lock_class_key loop_hctx_fq_lock_key;
static int nvme_loop_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, static int nvme_loop_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
unsigned int hctx_idx) unsigned int hctx_idx)
{ {
struct nvme_loop_ctrl *ctrl = data; struct nvme_loop_ctrl *ctrl = to_loop_ctrl(data);
struct nvme_loop_queue *queue = &ctrl->queues[hctx_idx + 1]; struct nvme_loop_queue *queue = &ctrl->queues[hctx_idx + 1];
BUG_ON(hctx_idx >= ctrl->ctrl.queue_count); BUG_ON(hctx_idx >= ctrl->ctrl.queue_count);
@@ -238,7 +238,7 @@ static int nvme_loop_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
static int nvme_loop_init_admin_hctx(struct blk_mq_hw_ctx *hctx, void *data, static int nvme_loop_init_admin_hctx(struct blk_mq_hw_ctx *hctx, void *data,
unsigned int hctx_idx) unsigned int hctx_idx)
{ {
struct nvme_loop_ctrl *ctrl = data; struct nvme_loop_ctrl *ctrl = to_loop_ctrl(data);
struct nvme_loop_queue *queue = &ctrl->queues[0]; struct nvme_loop_queue *queue = &ctrl->queues[0];
BUG_ON(hctx_idx != 0); BUG_ON(hctx_idx != 0);
@@ -266,9 +266,7 @@ static void nvme_loop_destroy_admin_queue(struct nvme_loop_ctrl *ctrl)
if (!test_and_clear_bit(NVME_LOOP_Q_LIVE, &ctrl->queues[0].flags)) if (!test_and_clear_bit(NVME_LOOP_Q_LIVE, &ctrl->queues[0].flags))
return; return;
nvmet_sq_destroy(&ctrl->queues[0].nvme_sq); nvmet_sq_destroy(&ctrl->queues[0].nvme_sq);
blk_mq_destroy_queue(ctrl->ctrl.admin_q); nvme_remove_admin_tag_set(&ctrl->ctrl);
blk_mq_destroy_queue(ctrl->ctrl.fabrics_q);
blk_mq_free_tag_set(&ctrl->admin_tag_set);
} }
static void nvme_loop_free_ctrl(struct nvme_ctrl *nctrl) static void nvme_loop_free_ctrl(struct nvme_ctrl *nctrl)
@@ -282,10 +280,8 @@ static void nvme_loop_free_ctrl(struct nvme_ctrl *nctrl)
list_del(&ctrl->list); list_del(&ctrl->list);
mutex_unlock(&nvme_loop_ctrl_mutex); mutex_unlock(&nvme_loop_ctrl_mutex);
if (nctrl->tagset) { if (nctrl->tagset)
blk_mq_destroy_queue(ctrl->ctrl.connect_q); nvme_remove_io_tag_set(nctrl);
blk_mq_free_tag_set(&ctrl->tag_set);
}
kfree(ctrl->queues); kfree(ctrl->queues);
nvmf_free_options(nctrl->opts); nvmf_free_options(nctrl->opts);
free_ctrl: free_ctrl:
@@ -350,52 +346,31 @@ static int nvme_loop_configure_admin_queue(struct nvme_loop_ctrl *ctrl)
{ {
int error; int error;
memset(&ctrl->admin_tag_set, 0, sizeof(ctrl->admin_tag_set));
ctrl->admin_tag_set.ops = &nvme_loop_admin_mq_ops;
ctrl->admin_tag_set.queue_depth = NVME_AQ_MQ_TAG_DEPTH;
ctrl->admin_tag_set.reserved_tags = NVMF_RESERVED_TAGS;
ctrl->admin_tag_set.numa_node = ctrl->ctrl.numa_node;
ctrl->admin_tag_set.cmd_size = sizeof(struct nvme_loop_iod) +
NVME_INLINE_SG_CNT * sizeof(struct scatterlist);
ctrl->admin_tag_set.driver_data = ctrl;
ctrl->admin_tag_set.nr_hw_queues = 1;
ctrl->admin_tag_set.timeout = NVME_ADMIN_TIMEOUT;
ctrl->admin_tag_set.flags = BLK_MQ_F_NO_SCHED;
ctrl->queues[0].ctrl = ctrl; ctrl->queues[0].ctrl = ctrl;
error = nvmet_sq_init(&ctrl->queues[0].nvme_sq); error = nvmet_sq_init(&ctrl->queues[0].nvme_sq);
if (error) if (error)
return error; return error;
ctrl->ctrl.queue_count = 1; ctrl->ctrl.queue_count = 1;
error = blk_mq_alloc_tag_set(&ctrl->admin_tag_set); error = nvme_alloc_admin_tag_set(&ctrl->ctrl, &ctrl->admin_tag_set,
&nvme_loop_admin_mq_ops, BLK_MQ_F_NO_SCHED,
sizeof(struct nvme_loop_iod) +
NVME_INLINE_SG_CNT * sizeof(struct scatterlist));
if (error) if (error)
goto out_free_sq; goto out_free_sq;
ctrl->ctrl.admin_tagset = &ctrl->admin_tag_set;
ctrl->ctrl.fabrics_q = blk_mq_init_queue(&ctrl->admin_tag_set);
if (IS_ERR(ctrl->ctrl.fabrics_q)) {
error = PTR_ERR(ctrl->ctrl.fabrics_q);
goto out_free_tagset;
}
ctrl->ctrl.admin_q = blk_mq_init_queue(&ctrl->admin_tag_set);
if (IS_ERR(ctrl->ctrl.admin_q)) {
error = PTR_ERR(ctrl->ctrl.admin_q);
goto out_cleanup_fabrics_q;
}
/* reset stopped state for the fresh admin queue */ /* reset stopped state for the fresh admin queue */
clear_bit(NVME_CTRL_ADMIN_Q_STOPPED, &ctrl->ctrl.flags); clear_bit(NVME_CTRL_ADMIN_Q_STOPPED, &ctrl->ctrl.flags);
error = nvmf_connect_admin_queue(&ctrl->ctrl); error = nvmf_connect_admin_queue(&ctrl->ctrl);
if (error) if (error)
goto out_cleanup_queue; goto out_cleanup_tagset;
set_bit(NVME_LOOP_Q_LIVE, &ctrl->queues[0].flags); set_bit(NVME_LOOP_Q_LIVE, &ctrl->queues[0].flags);
error = nvme_enable_ctrl(&ctrl->ctrl); error = nvme_enable_ctrl(&ctrl->ctrl);
if (error) if (error)
goto out_cleanup_queue; goto out_cleanup_tagset;
ctrl->ctrl.max_hw_sectors = ctrl->ctrl.max_hw_sectors =
(NVME_LOOP_MAX_SEGMENTS - 1) << (PAGE_SHIFT - 9); (NVME_LOOP_MAX_SEGMENTS - 1) << (PAGE_SHIFT - 9);
@@ -404,17 +379,13 @@ static int nvme_loop_configure_admin_queue(struct nvme_loop_ctrl *ctrl)
error = nvme_init_ctrl_finish(&ctrl->ctrl); error = nvme_init_ctrl_finish(&ctrl->ctrl);
if (error) if (error)
goto out_cleanup_queue; goto out_cleanup_tagset;
return 0; return 0;
out_cleanup_queue: out_cleanup_tagset:
clear_bit(NVME_LOOP_Q_LIVE, &ctrl->queues[0].flags); clear_bit(NVME_LOOP_Q_LIVE, &ctrl->queues[0].flags);
blk_mq_destroy_queue(ctrl->ctrl.admin_q); nvme_remove_admin_tag_set(&ctrl->ctrl);
out_cleanup_fabrics_q:
blk_mq_destroy_queue(ctrl->ctrl.fabrics_q);
out_free_tagset:
blk_mq_free_tag_set(&ctrl->admin_tag_set);
out_free_sq: out_free_sq:
nvmet_sq_destroy(&ctrl->queues[0].nvme_sq); nvmet_sq_destroy(&ctrl->queues[0].nvme_sq);
return error; return error;
@@ -522,37 +493,21 @@ static int nvme_loop_create_io_queues(struct nvme_loop_ctrl *ctrl)
if (ret) if (ret)
return ret; return ret;
memset(&ctrl->tag_set, 0, sizeof(ctrl->tag_set)); ret = nvme_alloc_io_tag_set(&ctrl->ctrl, &ctrl->tag_set,
ctrl->tag_set.ops = &nvme_loop_mq_ops; &nvme_loop_mq_ops, BLK_MQ_F_SHOULD_MERGE,
ctrl->tag_set.queue_depth = ctrl->ctrl.opts->queue_size; sizeof(struct nvme_loop_iod) +
ctrl->tag_set.reserved_tags = NVMF_RESERVED_TAGS; NVME_INLINE_SG_CNT * sizeof(struct scatterlist));
ctrl->tag_set.numa_node = ctrl->ctrl.numa_node;
ctrl->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
ctrl->tag_set.cmd_size = sizeof(struct nvme_loop_iod) +
NVME_INLINE_SG_CNT * sizeof(struct scatterlist);
ctrl->tag_set.driver_data = ctrl;
ctrl->tag_set.nr_hw_queues = ctrl->ctrl.queue_count - 1;
ctrl->tag_set.timeout = NVME_IO_TIMEOUT;
ctrl->ctrl.tagset = &ctrl->tag_set;
ret = blk_mq_alloc_tag_set(&ctrl->tag_set);
if (ret) if (ret)
goto out_destroy_queues; goto out_destroy_queues;
ret = nvme_ctrl_init_connect_q(&(ctrl->ctrl));
if (ret)
goto out_free_tagset;
ret = nvme_loop_connect_io_queues(ctrl); ret = nvme_loop_connect_io_queues(ctrl);
if (ret) if (ret)
goto out_cleanup_connect_q; goto out_cleanup_tagset;
return 0; return 0;
out_cleanup_connect_q: out_cleanup_tagset:
blk_mq_destroy_queue(ctrl->ctrl.connect_q); nvme_remove_io_tag_set(&ctrl->ctrl);
out_free_tagset:
blk_mq_free_tag_set(&ctrl->tag_set);
out_destroy_queues: out_destroy_queues:
nvme_loop_destroy_io_queues(ctrl); nvme_loop_destroy_io_queues(ctrl);
return ret; return ret;
@@ -601,7 +556,6 @@ static struct nvme_ctrl *nvme_loop_create_ctrl(struct device *dev,
ret = -ENOMEM; ret = -ENOMEM;
ctrl->ctrl.sqsize = opts->queue_size - 1;
ctrl->ctrl.kato = opts->kato; ctrl->ctrl.kato = opts->kato;
ctrl->port = nvme_loop_find_port(&ctrl->ctrl); ctrl->port = nvme_loop_find_port(&ctrl->ctrl);
@@ -621,6 +575,7 @@ static struct nvme_ctrl *nvme_loop_create_ctrl(struct device *dev,
opts->queue_size, ctrl->ctrl.maxcmd); opts->queue_size, ctrl->ctrl.maxcmd);
opts->queue_size = ctrl->ctrl.maxcmd; opts->queue_size = ctrl->ctrl.maxcmd;
} }
ctrl->ctrl.sqsize = opts->queue_size - 1;
if (opts->nr_io_queues) { if (opts->nr_io_queues) {
ret = nvme_loop_create_io_queues(ctrl); ret = nvme_loop_create_io_queues(ctrl);


@@ -704,7 +704,7 @@ int nvmet_auth_set_key(struct nvmet_host *host, const char *secret,
bool set_ctrl); bool set_ctrl);
int nvmet_auth_set_host_hash(struct nvmet_host *host, const char *hash); int nvmet_auth_set_host_hash(struct nvmet_host *host, const char *hash);
int nvmet_setup_auth(struct nvmet_ctrl *ctrl); int nvmet_setup_auth(struct nvmet_ctrl *ctrl);
void nvmet_init_auth(struct nvmet_ctrl *ctrl, struct nvmet_req *req); void nvmet_auth_sq_init(struct nvmet_sq *sq);
void nvmet_destroy_auth(struct nvmet_ctrl *ctrl); void nvmet_destroy_auth(struct nvmet_ctrl *ctrl);
void nvmet_auth_sq_free(struct nvmet_sq *sq); void nvmet_auth_sq_free(struct nvmet_sq *sq);
int nvmet_setup_dhgroup(struct nvmet_ctrl *ctrl, u8 dhgroup_id); int nvmet_setup_dhgroup(struct nvmet_ctrl *ctrl, u8 dhgroup_id);
@@ -726,8 +726,9 @@ static inline int nvmet_setup_auth(struct nvmet_ctrl *ctrl)
{ {
return 0; return 0;
} }
static inline void nvmet_init_auth(struct nvmet_ctrl *ctrl, static inline void nvmet_auth_sq_init(struct nvmet_sq *sq)
struct nvmet_req *req) {}; {
}
static inline void nvmet_destroy_auth(struct nvmet_ctrl *ctrl) {}; static inline void nvmet_destroy_auth(struct nvmet_ctrl *ctrl) {};
static inline void nvmet_auth_sq_free(struct nvmet_sq *sq) {}; static inline void nvmet_auth_sq_free(struct nvmet_sq *sq) {};
static inline bool nvmet_check_auth_status(struct nvmet_req *req) static inline bool nvmet_check_auth_status(struct nvmet_req *req)


@@ -215,9 +215,11 @@ static void nvmet_passthru_execute_cmd_work(struct work_struct *w)
{ {
struct nvmet_req *req = container_of(w, struct nvmet_req, p.work); struct nvmet_req *req = container_of(w, struct nvmet_req, p.work);
struct request *rq = req->p.rq; struct request *rq = req->p.rq;
struct nvme_ctrl *ctrl = nvme_req(rq)->ctrl;
u32 effects;
int status; int status;
status = nvme_execute_passthru_rq(rq); status = nvme_execute_passthru_rq(rq, &effects);
if (status == NVME_SC_SUCCESS && if (status == NVME_SC_SUCCESS &&
req->cmd->common.opcode == nvme_admin_identify) { req->cmd->common.opcode == nvme_admin_identify) {
@@ -238,6 +240,9 @@ static void nvmet_passthru_execute_cmd_work(struct work_struct *w)
req->cqe->result = nvme_req(rq)->result; req->cqe->result = nvme_req(rq)->result;
nvmet_req_complete(req, status); nvmet_req_complete(req, status);
blk_mq_free_request(rq); blk_mq_free_request(rq);
if (effects)
nvme_passthru_end(ctrl, effects, req->cmd, status);
} }
static void nvmet_passthru_req_done(struct request *rq, static void nvmet_passthru_req_done(struct request *rq,


@@ -77,9 +77,8 @@ struct nvmet_tcp_cmd {
u32 pdu_len; u32 pdu_len;
u32 pdu_recv; u32 pdu_recv;
int sg_idx; int sg_idx;
int nr_mapped;
struct msghdr recv_msg; struct msghdr recv_msg;
struct kvec *iov; struct bio_vec *iov;
u32 flags; u32 flags;
struct list_head entry; struct list_head entry;
@@ -165,9 +164,7 @@ static DEFINE_MUTEX(nvmet_tcp_queue_mutex);
static struct workqueue_struct *nvmet_tcp_wq; static struct workqueue_struct *nvmet_tcp_wq;
static const struct nvmet_fabrics_ops nvmet_tcp_ops; static const struct nvmet_fabrics_ops nvmet_tcp_ops;
static void nvmet_tcp_free_cmd(struct nvmet_tcp_cmd *c); static void nvmet_tcp_free_cmd(struct nvmet_tcp_cmd *c);
static void nvmet_tcp_finish_cmd(struct nvmet_tcp_cmd *cmd);
static void nvmet_tcp_free_cmd_buffers(struct nvmet_tcp_cmd *cmd); static void nvmet_tcp_free_cmd_buffers(struct nvmet_tcp_cmd *cmd);
static void nvmet_tcp_unmap_pdu_iovec(struct nvmet_tcp_cmd *cmd);
static inline u16 nvmet_tcp_cmd_tag(struct nvmet_tcp_queue *queue, static inline u16 nvmet_tcp_cmd_tag(struct nvmet_tcp_queue *queue,
struct nvmet_tcp_cmd *cmd) struct nvmet_tcp_cmd *cmd)
@@ -301,35 +298,21 @@ static int nvmet_tcp_check_ddgst(struct nvmet_tcp_queue *queue, void *pdu)
static void nvmet_tcp_free_cmd_buffers(struct nvmet_tcp_cmd *cmd) static void nvmet_tcp_free_cmd_buffers(struct nvmet_tcp_cmd *cmd)
{ {
WARN_ON(unlikely(cmd->nr_mapped > 0));
kfree(cmd->iov); kfree(cmd->iov);
sgl_free(cmd->req.sg); sgl_free(cmd->req.sg);
cmd->iov = NULL; cmd->iov = NULL;
cmd->req.sg = NULL; cmd->req.sg = NULL;
} }
static void nvmet_tcp_unmap_pdu_iovec(struct nvmet_tcp_cmd *cmd) static void nvmet_tcp_build_pdu_iovec(struct nvmet_tcp_cmd *cmd)
{ {
struct scatterlist *sg; struct bio_vec *iov = cmd->iov;
int i;
sg = &cmd->req.sg[cmd->sg_idx];
for (i = 0; i < cmd->nr_mapped; i++)
kunmap(sg_page(&sg[i]));
cmd->nr_mapped = 0;
}
static void nvmet_tcp_map_pdu_iovec(struct nvmet_tcp_cmd *cmd)
{
struct kvec *iov = cmd->iov;
struct scatterlist *sg; struct scatterlist *sg;
u32 length, offset, sg_offset; u32 length, offset, sg_offset;
int nr_pages;
length = cmd->pdu_len; length = cmd->pdu_len;
cmd->nr_mapped = DIV_ROUND_UP(length, PAGE_SIZE); nr_pages = DIV_ROUND_UP(length, PAGE_SIZE);
offset = cmd->rbytes_done; offset = cmd->rbytes_done;
cmd->sg_idx = offset / PAGE_SIZE; cmd->sg_idx = offset / PAGE_SIZE;
sg_offset = offset % PAGE_SIZE; sg_offset = offset % PAGE_SIZE;
@@ -338,8 +321,9 @@ static void nvmet_tcp_map_pdu_iovec(struct nvmet_tcp_cmd *cmd)
while (length) { while (length) {
u32 iov_len = min_t(u32, length, sg->length - sg_offset); u32 iov_len = min_t(u32, length, sg->length - sg_offset);
iov->iov_base = kmap(sg_page(sg)) + sg->offset + sg_offset; iov->bv_page = sg_page(sg);
iov->iov_len = iov_len; iov->bv_len = sg->length;
iov->bv_offset = sg->offset + sg_offset;
length -= iov_len; length -= iov_len;
sg = sg_next(sg); sg = sg_next(sg);
@@ -347,8 +331,8 @@ static void nvmet_tcp_map_pdu_iovec(struct nvmet_tcp_cmd *cmd)
sg_offset = 0; sg_offset = 0;
} }
iov_iter_kvec(&cmd->recv_msg.msg_iter, READ, cmd->iov, iov_iter_bvec(&cmd->recv_msg.msg_iter, READ, cmd->iov,
cmd->nr_mapped, cmd->pdu_len); nr_pages, cmd->pdu_len);
} }
static void nvmet_tcp_fatal_error(struct nvmet_tcp_queue *queue) static void nvmet_tcp_fatal_error(struct nvmet_tcp_queue *queue)
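
The iovec construction above walks the scatterlist starting at the already-received offset. A minimal userspace C sketch of just that offset arithmetic follows, assuming page-sized segments and made-up byte counts.

/*
 * Userspace sketch, not kernel code: given the bytes already received and
 * the PDU length, locate the starting page-sized segment and split the
 * remaining range into (segment, offset, length) pieces.  All numbers are
 * made up; real segments come from the command's scatterlist.
 */
#include <stdio.h>

#define PAGE_SIZE 4096u

int main(void)
{
        unsigned int rbytes_done = 6000;        /* bytes already received */
        unsigned int pdu_len = 9000;            /* bytes in this data PDU */
        unsigned int sg_idx = rbytes_done / PAGE_SIZE;
        unsigned int sg_offset = rbytes_done % PAGE_SIZE;
        unsigned int length = pdu_len;

        while (length) {
                unsigned int room = PAGE_SIZE - sg_offset;   /* space left in segment */
                unsigned int iov_len = length < room ? length : room;

                printf("segment %u: offset %u, len %u\n", sg_idx, sg_offset, iov_len);
                length -= iov_len;
                sg_idx++;
                sg_offset = 0;
        }
        return 0;
}
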
@@ -926,7 +910,7 @@ static void nvmet_tcp_handle_req_failure(struct nvmet_tcp_queue *queue,
} }
queue->rcv_state = NVMET_TCP_RECV_DATA; queue->rcv_state = NVMET_TCP_RECV_DATA;
nvmet_tcp_map_pdu_iovec(cmd); nvmet_tcp_build_pdu_iovec(cmd);
cmd->flags |= NVMET_TCP_F_INIT_FAILED; cmd->flags |= NVMET_TCP_F_INIT_FAILED;
} }
@@ -935,10 +919,17 @@ static int nvmet_tcp_handle_h2c_data_pdu(struct nvmet_tcp_queue *queue)
struct nvme_tcp_data_pdu *data = &queue->pdu.data; struct nvme_tcp_data_pdu *data = &queue->pdu.data;
struct nvmet_tcp_cmd *cmd; struct nvmet_tcp_cmd *cmd;
if (likely(queue->nr_cmds)) if (likely(queue->nr_cmds)) {
if (unlikely(data->ttag >= queue->nr_cmds)) {
pr_err("queue %d: received out of bound ttag %u, nr_cmds %u\n",
queue->idx, data->ttag, queue->nr_cmds);
nvmet_tcp_fatal_error(queue);
return -EPROTO;
}
cmd = &queue->cmds[data->ttag]; cmd = &queue->cmds[data->ttag];
else } else {
cmd = &queue->connect; cmd = &queue->connect;
}
if (le32_to_cpu(data->data_offset) != cmd->rbytes_done) { if (le32_to_cpu(data->data_offset) != cmd->rbytes_done) {
pr_err("ttag %u unexpected data offset %u (expected %u)\n", pr_err("ttag %u unexpected data offset %u (expected %u)\n",
@@ -952,7 +943,7 @@ static int nvmet_tcp_handle_h2c_data_pdu(struct nvmet_tcp_queue *queue)
cmd->pdu_len = le32_to_cpu(data->data_length); cmd->pdu_len = le32_to_cpu(data->data_length);
cmd->pdu_recv = 0; cmd->pdu_recv = 0;
nvmet_tcp_map_pdu_iovec(cmd); nvmet_tcp_build_pdu_iovec(cmd);
queue->cmd = cmd; queue->cmd = cmd;
queue->rcv_state = NVMET_TCP_RECV_DATA; queue->rcv_state = NVMET_TCP_RECV_DATA;
@@ -976,6 +967,13 @@ static int nvmet_tcp_done_recv_pdu(struct nvmet_tcp_queue *queue)
return nvmet_tcp_handle_icreq(queue); return nvmet_tcp_handle_icreq(queue);
} }
if (unlikely(hdr->type == nvme_tcp_icreq)) {
pr_err("queue %d: received icreq pdu in state %d\n",
queue->idx, queue->state);
nvmet_tcp_fatal_error(queue);
return -EPROTO;
}
if (hdr->type == nvme_tcp_h2c_data) { if (hdr->type == nvme_tcp_h2c_data) {
ret = nvmet_tcp_handle_h2c_data_pdu(queue); ret = nvmet_tcp_handle_h2c_data_pdu(queue);
if (unlikely(ret)) if (unlikely(ret))
@@ -1021,7 +1019,7 @@ static int nvmet_tcp_done_recv_pdu(struct nvmet_tcp_queue *queue)
if (nvmet_tcp_need_data_in(queue->cmd)) { if (nvmet_tcp_need_data_in(queue->cmd)) {
if (nvmet_tcp_has_inline_data(queue->cmd)) { if (nvmet_tcp_has_inline_data(queue->cmd)) {
queue->rcv_state = NVMET_TCP_RECV_DATA; queue->rcv_state = NVMET_TCP_RECV_DATA;
nvmet_tcp_map_pdu_iovec(queue->cmd); nvmet_tcp_build_pdu_iovec(queue->cmd);
return 0; return 0;
} }
/* send back R2T */ /* send back R2T */
@@ -1141,7 +1139,6 @@ static int nvmet_tcp_try_recv_data(struct nvmet_tcp_queue *queue)
cmd->rbytes_done += ret; cmd->rbytes_done += ret;
} }
nvmet_tcp_unmap_pdu_iovec(cmd);
if (queue->data_digest) { if (queue->data_digest) {
nvmet_tcp_prep_recv_ddgst(cmd); nvmet_tcp_prep_recv_ddgst(cmd);
return 0; return 0;
@@ -1179,7 +1176,8 @@ static int nvmet_tcp_try_recv_ddgst(struct nvmet_tcp_queue *queue)
queue->idx, cmd->req.cmd->common.command_id, queue->idx, cmd->req.cmd->common.command_id,
queue->pdu.cmd.hdr.type, le32_to_cpu(cmd->recv_ddgst), queue->pdu.cmd.hdr.type, le32_to_cpu(cmd->recv_ddgst),
le32_to_cpu(cmd->exp_ddgst)); le32_to_cpu(cmd->exp_ddgst));
nvmet_tcp_finish_cmd(cmd); nvmet_req_uninit(&cmd->req);
nvmet_tcp_free_cmd_buffers(cmd);
nvmet_tcp_fatal_error(queue); nvmet_tcp_fatal_error(queue);
ret = -EPROTO; ret = -EPROTO;
goto out; goto out;
@@ -1408,13 +1406,6 @@ static void nvmet_tcp_restore_socket_callbacks(struct nvmet_tcp_queue *queue)
write_unlock_bh(&sock->sk->sk_callback_lock); write_unlock_bh(&sock->sk->sk_callback_lock);
} }
static void nvmet_tcp_finish_cmd(struct nvmet_tcp_cmd *cmd)
{
nvmet_req_uninit(&cmd->req);
nvmet_tcp_unmap_pdu_iovec(cmd);
nvmet_tcp_free_cmd_buffers(cmd);
}
static void nvmet_tcp_uninit_data_in_cmds(struct nvmet_tcp_queue *queue) static void nvmet_tcp_uninit_data_in_cmds(struct nvmet_tcp_queue *queue)
{ {
struct nvmet_tcp_cmd *cmd = queue->cmds; struct nvmet_tcp_cmd *cmd = queue->cmds;
@@ -1423,17 +1414,28 @@ static void nvmet_tcp_uninit_data_in_cmds(struct nvmet_tcp_queue *queue)
for (i = 0; i < queue->nr_cmds; i++, cmd++) { for (i = 0; i < queue->nr_cmds; i++, cmd++) {
if (nvmet_tcp_need_data_in(cmd)) if (nvmet_tcp_need_data_in(cmd))
nvmet_req_uninit(&cmd->req); nvmet_req_uninit(&cmd->req);
nvmet_tcp_unmap_pdu_iovec(cmd);
nvmet_tcp_free_cmd_buffers(cmd);
} }
if (!queue->nr_cmds && nvmet_tcp_need_data_in(&queue->connect)) { if (!queue->nr_cmds && nvmet_tcp_need_data_in(&queue->connect)) {
/* failed in connect */ /* failed in connect */
nvmet_tcp_finish_cmd(&queue->connect); nvmet_req_uninit(&queue->connect.req);
} }
} }
static void nvmet_tcp_free_cmd_data_in_buffers(struct nvmet_tcp_queue *queue)
{
struct nvmet_tcp_cmd *cmd = queue->cmds;
int i;
for (i = 0; i < queue->nr_cmds; i++, cmd++) {
if (nvmet_tcp_need_data_in(cmd))
nvmet_tcp_free_cmd_buffers(cmd);
}
if (!queue->nr_cmds && nvmet_tcp_need_data_in(&queue->connect))
nvmet_tcp_free_cmd_buffers(&queue->connect);
}
static void nvmet_tcp_release_queue_work(struct work_struct *w) static void nvmet_tcp_release_queue_work(struct work_struct *w)
{ {
struct page *page; struct page *page;
@@ -1452,6 +1454,7 @@ static void nvmet_tcp_release_queue_work(struct work_struct *w)
nvmet_tcp_uninit_data_in_cmds(queue); nvmet_tcp_uninit_data_in_cmds(queue);
nvmet_sq_destroy(&queue->nvme_sq); nvmet_sq_destroy(&queue->nvme_sq);
cancel_work_sync(&queue->io_work); cancel_work_sync(&queue->io_work);
nvmet_tcp_free_cmd_data_in_buffers(queue);
sock_release(queue->sock); sock_release(queue->sock);
nvmet_tcp_free_cmds(queue); nvmet_tcp_free_cmds(queue);
if (queue->hdr_digest || queue->data_digest) if (queue->hdr_digest || queue->data_digest)


@@ -400,7 +400,6 @@ static u16 nvmet_bdev_zone_mgmt_emulate_all(struct nvmet_req *req)
{ {
struct block_device *bdev = req->ns->bdev; struct block_device *bdev = req->ns->bdev;
unsigned int nr_zones = bdev_nr_zones(bdev); unsigned int nr_zones = bdev_nr_zones(bdev);
struct request_queue *q = bdev_get_queue(bdev);
struct bio *bio = NULL; struct bio *bio = NULL;
sector_t sector = 0; sector_t sector = 0;
int ret; int ret;
@@ -409,7 +408,7 @@ static u16 nvmet_bdev_zone_mgmt_emulate_all(struct nvmet_req *req)
}; };
d.zbitmap = kcalloc_node(BITS_TO_LONGS(nr_zones), sizeof(*(d.zbitmap)), d.zbitmap = kcalloc_node(BITS_TO_LONGS(nr_zones), sizeof(*(d.zbitmap)),
GFP_NOIO, q->node); GFP_NOIO, bdev->bd_disk->node_id);
if (!d.zbitmap) { if (!d.zbitmap) {
ret = -ENOMEM; ret = -ENOMEM;
goto out; goto out;


@@ -41,15 +41,6 @@
#define DASD_DIAG_MOD "dasd_diag_mod" #define DASD_DIAG_MOD "dasd_diag_mod"
static unsigned int queue_depth = 32;
static unsigned int nr_hw_queues = 4;
module_param(queue_depth, uint, 0444);
MODULE_PARM_DESC(queue_depth, "Default queue depth for new DASD devices");
module_param(nr_hw_queues, uint, 0444);
MODULE_PARM_DESC(nr_hw_queues, "Default number of hardware queues for new DASD devices");
/* /*
* SECTION: exported variables of dasd.c * SECTION: exported variables of dasd.c
*/ */
@@ -68,8 +59,6 @@ MODULE_LICENSE("GPL");
/* /*
* SECTION: prototypes for static functions of dasd.c * SECTION: prototypes for static functions of dasd.c
*/ */
static int dasd_alloc_queue(struct dasd_block *);
static void dasd_free_queue(struct dasd_block *);
static int dasd_flush_block_queue(struct dasd_block *); static int dasd_flush_block_queue(struct dasd_block *);
static void dasd_device_tasklet(unsigned long); static void dasd_device_tasklet(unsigned long);
static void dasd_block_tasklet(unsigned long); static void dasd_block_tasklet(unsigned long);
@@ -198,21 +187,11 @@ EXPORT_SYMBOL_GPL(dasd_free_block);
*/ */
static int dasd_state_new_to_known(struct dasd_device *device) static int dasd_state_new_to_known(struct dasd_device *device)
{ {
int rc;
/* /*
* As long as the device is not in state DASD_STATE_NEW we want to * As long as the device is not in state DASD_STATE_NEW we want to
* keep the reference count > 0. * keep the reference count > 0.
*/ */
dasd_get_device(device); dasd_get_device(device);
if (device->block) {
rc = dasd_alloc_queue(device->block);
if (rc) {
dasd_put_device(device);
return rc;
}
}
device->state = DASD_STATE_KNOWN; device->state = DASD_STATE_KNOWN;
return 0; return 0;
} }
@@ -226,9 +205,6 @@ static int dasd_state_known_to_new(struct dasd_device *device)
dasd_eer_disable(device); dasd_eer_disable(device);
device->state = DASD_STATE_NEW; device->state = DASD_STATE_NEW;
if (device->block)
dasd_free_queue(device->block);
/* Give up reference we took in dasd_state_new_to_known. */ /* Give up reference we took in dasd_state_new_to_known. */
dasd_put_device(device); dasd_put_device(device);
return 0; return 0;
@@ -1591,9 +1567,8 @@ void dasd_generic_handle_state_change(struct dasd_device *device)
dasd_schedule_device_bh(device); dasd_schedule_device_bh(device);
if (device->block) { if (device->block) {
dasd_schedule_block_bh(device->block); dasd_schedule_block_bh(device->block);
if (device->block->request_queue) if (device->block->gdp)
blk_mq_run_hw_queues(device->block->request_queue, blk_mq_run_hw_queues(device->block->gdp->queue, true);
true);
} }
} }
EXPORT_SYMBOL_GPL(dasd_generic_handle_state_change); EXPORT_SYMBOL_GPL(dasd_generic_handle_state_change);
@@ -2691,7 +2666,7 @@ static void dasd_block_timeout(struct timer_list *t)
dasd_device_remove_stop_bits(block->base, DASD_STOPPED_PENDING); dasd_device_remove_stop_bits(block->base, DASD_STOPPED_PENDING);
spin_unlock_irqrestore(get_ccwdev_lock(block->base->cdev), flags); spin_unlock_irqrestore(get_ccwdev_lock(block->base->cdev), flags);
dasd_schedule_block_bh(block); dasd_schedule_block_bh(block);
blk_mq_run_hw_queues(block->request_queue, true); blk_mq_run_hw_queues(block->gdp->queue, true);
} }
/* /*
@@ -3239,7 +3214,7 @@ static void dasd_request_done(struct request *req)
blk_mq_run_hw_queues(req->q, true); blk_mq_run_hw_queues(req->q, true);
} }
static struct blk_mq_ops dasd_mq_ops = { struct blk_mq_ops dasd_mq_ops = {
.queue_rq = do_dasd_request, .queue_rq = do_dasd_request,
.complete = dasd_request_done, .complete = dasd_request_done,
.timeout = dasd_times_out, .timeout = dasd_times_out,
@@ -3247,45 +3222,6 @@ static struct blk_mq_ops dasd_mq_ops = {
.exit_hctx = dasd_exit_hctx, .exit_hctx = dasd_exit_hctx,
}; };
/*
* Allocate and initialize request queue and default I/O scheduler.
*/
static int dasd_alloc_queue(struct dasd_block *block)
{
int rc;
block->tag_set.ops = &dasd_mq_ops;
block->tag_set.cmd_size = sizeof(struct dasd_ccw_req);
block->tag_set.nr_hw_queues = nr_hw_queues;
block->tag_set.queue_depth = queue_depth;
block->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
block->tag_set.numa_node = NUMA_NO_NODE;
rc = blk_mq_alloc_tag_set(&block->tag_set);
if (rc)
return rc;
block->request_queue = blk_mq_init_queue(&block->tag_set);
if (IS_ERR(block->request_queue))
return PTR_ERR(block->request_queue);
block->request_queue->queuedata = block;
return 0;
}
/*
* Deactivate and free request queue.
*/
static void dasd_free_queue(struct dasd_block *block)
{
if (block->request_queue) {
blk_mq_destroy_queue(block->request_queue);
blk_mq_free_tag_set(&block->tag_set);
block->request_queue = NULL;
}
}
static int dasd_open(struct block_device *bdev, fmode_t mode) static int dasd_open(struct block_device *bdev, fmode_t mode)
{ {
struct dasd_device *base; struct dasd_device *base;
@@ -3762,10 +3698,9 @@ int dasd_generic_path_operational(struct dasd_device *device)
dasd_schedule_device_bh(device); dasd_schedule_device_bh(device);
if (device->block) { if (device->block) {
dasd_schedule_block_bh(device->block); dasd_schedule_block_bh(device->block);
if (device->block->request_queue) if (device->block->gdp)
blk_mq_run_hw_queues(device->block->request_queue, blk_mq_run_hw_queues(device->block->gdp->queue, true);
true); }
}
if (!device->stopped) if (!device->stopped)
wake_up(&generic_waitq); wake_up(&generic_waitq);
@@ -3916,8 +3851,8 @@ void dasd_generic_space_avail(struct dasd_device *device)
if (device->block) { if (device->block) {
dasd_schedule_block_bh(device->block); dasd_schedule_block_bh(device->block);
if (device->block->request_queue) if (device->block->gdp)
blk_mq_run_hw_queues(device->block->request_queue, true); blk_mq_run_hw_queues(device->block->gdp->queue, true);
} }
if (!device->stopped) if (!device->stopped)
wake_up(&generic_waitq); wake_up(&generic_waitq);
@@ -3927,7 +3862,7 @@ EXPORT_SYMBOL_GPL(dasd_generic_space_avail);
/* /*
* clear active requests and requeue them to block layer if possible * clear active requests and requeue them to block layer if possible
*/ */
static int dasd_generic_requeue_all_requests(struct dasd_device *device) int dasd_generic_requeue_all_requests(struct dasd_device *device)
{ {
struct list_head requeue_queue; struct list_head requeue_queue;
struct dasd_ccw_req *cqr, *n; struct dasd_ccw_req *cqr, *n;
@@ -4001,6 +3936,7 @@ static int dasd_generic_requeue_all_requests(struct dasd_device *device)
dasd_schedule_device_bh(device); dasd_schedule_device_bh(device);
return rc; return rc;
} }
EXPORT_SYMBOL_GPL(dasd_generic_requeue_all_requests);
static void do_requeue_requests(struct work_struct *work) static void do_requeue_requests(struct work_struct *work)
{ {


@@ -1050,6 +1050,11 @@ dasd_3990_erp_com_rej(struct dasd_ccw_req * erp, char *sense)
dev_err(&device->cdev->dev, "An I/O request was rejected" dev_err(&device->cdev->dev, "An I/O request was rejected"
" because writing is inhibited\n"); " because writing is inhibited\n");
erp = dasd_3990_erp_cleanup(erp, DASD_CQR_FAILED); erp = dasd_3990_erp_cleanup(erp, DASD_CQR_FAILED);
} else if (sense[7] & SNS7_INVALID_ON_SEC) {
dev_err(&device->cdev->dev, "An I/O request was rejected on a copy pair secondary device\n");
/* suppress dump of sense data for this error */
set_bit(DASD_CQR_SUPPRESS_CR, &erp->refers->flags);
erp = dasd_3990_erp_cleanup(erp, DASD_CQR_FAILED);
} else { } else {
/* fatal error - set status to FAILED /* fatal error - set status to FAILED
internal error 09 - Command Reject */ internal error 09 - Command Reject */


@@ -26,7 +26,6 @@
/* This is ugly... */ /* This is ugly... */
#define PRINTK_HEADER "dasd_devmap:" #define PRINTK_HEADER "dasd_devmap:"
#define DASD_BUS_ID_SIZE 20
#define DASD_MAX_PARAMS 256 #define DASD_MAX_PARAMS 256
#include "dasd_int.h" #include "dasd_int.h"
@@ -50,6 +49,7 @@ struct dasd_devmap {
unsigned int devindex; unsigned int devindex;
unsigned short features; unsigned short features;
struct dasd_device *device; struct dasd_device *device;
struct dasd_copy_relation *copy;
}; };
/* /*
@@ -130,7 +130,7 @@ __setup ("dasd=", dasd_call_setup);
/* /*
* Read a device busid/devno from a string. * Read a device busid/devno from a string.
*/ */
static int __init dasd_busid(char *str, int *id0, int *id1, int *devno) static int dasd_busid(char *str, int *id0, int *id1, int *devno)
{ {
unsigned int val; unsigned int val;
char *tok; char *tok;
@@ -438,16 +438,12 @@ dasd_add_busid(const char *bus_id, int features)
return devmap; return devmap;
} }
/*
* Find devmap for device with given bus_id.
*/
static struct dasd_devmap * static struct dasd_devmap *
dasd_find_busid(const char *bus_id) dasd_find_busid_locked(const char *bus_id)
{ {
struct dasd_devmap *devmap, *tmp; struct dasd_devmap *devmap, *tmp;
int hash; int hash;
spin_lock(&dasd_devmap_lock);
devmap = ERR_PTR(-ENODEV); devmap = ERR_PTR(-ENODEV);
hash = dasd_hash_busid(bus_id); hash = dasd_hash_busid(bus_id);
list_for_each_entry(tmp, &dasd_hashlists[hash], list) { list_for_each_entry(tmp, &dasd_hashlists[hash], list) {
@@ -456,6 +452,19 @@ dasd_find_busid(const char *bus_id)
break; break;
} }
} }
return devmap;
}
/*
* Find devmap for device with given bus_id.
*/
static struct dasd_devmap *
dasd_find_busid(const char *bus_id)
{
struct dasd_devmap *devmap;
spin_lock(&dasd_devmap_lock);
devmap = dasd_find_busid_locked(bus_id);
spin_unlock(&dasd_devmap_lock); spin_unlock(&dasd_devmap_lock);
return devmap; return devmap;
} }
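
The refactoring above is the usual locked/unlocked helper split. A minimal userspace C sketch of the pattern follows, using a pthread mutex and a placeholder table; none of these names come from the driver.

/*
 * Userspace sketch, not kernel code: the lookup body becomes a *_locked()
 * helper that expects the caller to hold the lock, and the original entry
 * point turns into a thin wrapper that takes and drops the lock.  The
 * table, lock and bus IDs below are placeholders.
 */
#include <pthread.h>
#include <stdio.h>
#include <string.h>

static pthread_mutex_t devmap_lock = PTHREAD_MUTEX_INITIALIZER;
static const char *busids[] = { "0.0.1000", "0.0.2000", "0.0.3000" };

/* caller must hold devmap_lock */
static int find_busid_locked(const char *bus_id)
{
        for (unsigned int i = 0; i < 3; i++)
                if (!strcmp(busids[i], bus_id))
                        return (int)i;
        return -1;      /* "not found", -ENODEV in the driver */
}

/* wrapper that owns the locking, like dasd_find_busid() above */
static int find_busid(const char *bus_id)
{
        int idx;

        pthread_mutex_lock(&devmap_lock);
        idx = find_busid_locked(bus_id);
        pthread_mutex_unlock(&devmap_lock);
        return idx;
}

int main(void)
{
        printf("0.0.2000 -> %d\n", find_busid("0.0.2000"));
        printf("0.0.9999 -> %d\n", find_busid("0.0.9999"));
        return 0;
}

The split lets callers that already hold dasd_devmap_lock reuse the lookup without re-taking the lock, which the copy-pair handling added later in this file relies on.
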
@@ -584,6 +593,238 @@ dasd_create_device(struct ccw_device *cdev)
return device; return device;
} }
/*
* allocate a PPRC data structure and call the discipline function to fill
*/
static int dasd_devmap_get_pprc_status(struct dasd_device *device,
struct dasd_pprc_data_sc4 **data)
{
struct dasd_pprc_data_sc4 *temp;
if (!device->discipline || !device->discipline->pprc_status) {
dev_warn(&device->cdev->dev, "Unable to query copy relation status\n");
return -EOPNOTSUPP;
}
temp = kzalloc(sizeof(*temp), GFP_KERNEL);
if (!temp)
return -ENOMEM;
/* get PPRC information from storage */
if (device->discipline->pprc_status(device, temp)) {
dev_warn(&device->cdev->dev, "Error during copy relation status query\n");
kfree(temp);
return -EINVAL;
}
*data = temp;
return 0;
}
/*
* find an entry in a PPRC device_info array by a given UID
* depending on the primary/secondary state of the device it has to be
* matched with the respective fields
*/
static int dasd_devmap_entry_from_pprc_data(struct dasd_pprc_data_sc4 *data,
struct dasd_uid uid,
bool primary)
{
int i;
for (i = 0; i < DASD_CP_ENTRIES; i++) {
if (primary) {
if (data->dev_info[i].prim_cu_ssid == uid.ssid &&
data->dev_info[i].primary == uid.real_unit_addr)
return i;
} else {
if (data->dev_info[i].sec_cu_ssid == uid.ssid &&
data->dev_info[i].secondary == uid.real_unit_addr)
return i;
}
}
return -1;
}
/*
* check the consistency of a specified copy relation by checking
* the following things:
*
* - is the given device part of a copy pair setup
* - does the state of the device match the state in the PPRC status data
* - does the device UID match with the UID in the PPRC status data
* - to prevent misrouted IO check if the given device is present in all
* related PPRC status data
*/
static int dasd_devmap_check_copy_relation(struct dasd_device *device,
struct dasd_copy_entry *entry,
struct dasd_pprc_data_sc4 *data,
struct dasd_copy_relation *copy)
{
struct dasd_pprc_data_sc4 *tmp_dat;
struct dasd_device *tmp_dev;
struct dasd_uid uid;
int i, j;
if (!device->discipline || !device->discipline->get_uid ||
device->discipline->get_uid(device, &uid))
return 1;
i = dasd_devmap_entry_from_pprc_data(data, uid, entry->primary);
if (i < 0) {
dev_warn(&device->cdev->dev, "Device not part of a copy relation\n");
return 1;
}
/* double check which role the current device has */
if (entry->primary) {
if (data->dev_info[i].flags & 0x80) {
dev_warn(&device->cdev->dev, "Copy pair secondary is setup as primary\n");
return 1;
}
if (data->dev_info[i].prim_cu_ssid != uid.ssid ||
data->dev_info[i].primary != uid.real_unit_addr) {
dev_warn(&device->cdev->dev,
"Primary device %s does not match copy pair status primary device %04x\n",
dev_name(&device->cdev->dev),
data->dev_info[i].prim_cu_ssid |
data->dev_info[i].primary);
return 1;
}
} else {
if (!(data->dev_info[i].flags & 0x80)) {
dev_warn(&device->cdev->dev, "Copy pair primary is setup as secondary\n");
return 1;
}
if (data->dev_info[i].sec_cu_ssid != uid.ssid ||
data->dev_info[i].secondary != uid.real_unit_addr) {
dev_warn(&device->cdev->dev,
"Secondary device %s does not match copy pair status secondary device %04x\n",
dev_name(&device->cdev->dev),
data->dev_info[i].sec_cu_ssid |
data->dev_info[i].secondary);
return 1;
}
}
/*
* the current device has to be part of the copy relation of all
* entries to prevent misrouted IO to another copy pair
*/
for (j = 0; j < DASD_CP_ENTRIES; j++) {
if (entry == &copy->entry[j])
tmp_dev = device;
else
tmp_dev = copy->entry[j].device;
if (!tmp_dev)
continue;
if (dasd_devmap_get_pprc_status(tmp_dev, &tmp_dat))
return 1;
if (dasd_devmap_entry_from_pprc_data(tmp_dat, uid, entry->primary) < 0) {
dev_warn(&tmp_dev->cdev->dev,
"Copy pair relation does not contain device: %s\n",
dev_name(&device->cdev->dev));
kfree(tmp_dat);
return 1;
}
kfree(tmp_dat);
}
return 0;
}
/* delete device from copy relation entry */
static void dasd_devmap_delete_copy_relation_device(struct dasd_device *device)
{
struct dasd_copy_relation *copy;
int i;
if (!device->copy)
return;
copy = device->copy;
for (i = 0; i < DASD_CP_ENTRIES; i++) {
if (copy->entry[i].device == device)
copy->entry[i].device = NULL;
}
dasd_put_device(device);
device->copy = NULL;
}
/*
* read all required information for a copy relation setup and setup the device
* accordingly
*/
int dasd_devmap_set_device_copy_relation(struct ccw_device *cdev,
bool pprc_enabled)
{
struct dasd_pprc_data_sc4 *data = NULL;
struct dasd_copy_entry *entry = NULL;
struct dasd_copy_relation *copy;
struct dasd_devmap *devmap;
struct dasd_device *device;
int i, rc = 0;
devmap = dasd_devmap_from_cdev(cdev);
if (IS_ERR(devmap))
return PTR_ERR(devmap);
device = devmap->device;
if (!device)
return -ENODEV;
copy = devmap->copy;
/* no copy pair setup for this device */
if (!copy)
goto out;
rc = dasd_devmap_get_pprc_status(device, &data);
if (rc)
return rc;
/* print error if PPRC is requested but not enabled on storage server */
if (!pprc_enabled) {
dev_err(&cdev->dev, "Copy relation not enabled on storage server\n");
rc = -EINVAL;
goto out;
}
if (!data->dev_info[0].state) {
dev_warn(&device->cdev->dev, "Copy pair setup requested for device not in copy relation\n");
rc = -EINVAL;
goto out;
}
/* find entry */
for (i = 0; i < DASD_CP_ENTRIES; i++) {
if (copy->entry[i].configured &&
strncmp(dev_name(&cdev->dev),
copy->entry[i].busid, DASD_BUS_ID_SIZE) == 0) {
entry = &copy->entry[i];
break;
}
}
if (!entry) {
dev_warn(&device->cdev->dev, "Copy relation entry not found\n");
rc = -EINVAL;
goto out;
}
/* check if the copy relation is valid */
if (dasd_devmap_check_copy_relation(device, entry, data, copy)) {
dev_warn(&device->cdev->dev, "Copy relation faulty\n");
rc = -EINVAL;
goto out;
}
dasd_get_device(device);
copy->entry[i].device = device;
device->copy = copy;
out:
kfree(data);
return rc;
}
EXPORT_SYMBOL_GPL(dasd_devmap_set_device_copy_relation);
/*
 * Wait queue for dasd_delete_device waits.
 */
@@ -617,6 +858,8 @@ dasd_delete_device(struct dasd_device *device)
dev_set_drvdata(&device->cdev->dev, NULL);
spin_unlock_irqrestore(get_ccwdev_lock(device->cdev), flags);
/* Remove copy relation */
dasd_devmap_delete_copy_relation_device(device);
/*
 * Drop ref_count by 3, one for the devmap reference, one for
 * the cdev reference and one for the passed reference.
@@ -694,6 +937,7 @@ void dasd_add_link_to_gendisk(struct gendisk *gdp, struct dasd_device *device)
gdp->private_data = devmap;
spin_unlock(&dasd_devmap_lock);
}
EXPORT_SYMBOL(dasd_add_link_to_gendisk);
struct dasd_device *dasd_device_from_gendisk(struct gendisk *gdp)
{
@@ -1334,7 +1578,6 @@ dasd_timeout_store(struct device *dev, struct device_attribute *attr,
const char *buf, size_t count)
{
struct dasd_device *device;
-struct request_queue *q;
unsigned long val;
device = dasd_device_from_cdev(to_ccwdev(dev));
@@ -1346,15 +1589,13 @@ dasd_timeout_store(struct device *dev, struct device_attribute *attr,
dasd_put_device(device);
return -EINVAL;
}
-q = device->block->request_queue;
-if (!q) {
+if (!device->block->gdp) {
dasd_put_device(device);
return -ENODEV;
}
device->blk_timeout = val;
-blk_queue_rq_timeout(q, device->blk_timeout * HZ);
+blk_queue_rq_timeout(device->block->gdp->queue, val * HZ);
dasd_put_device(device);
return count;
@@ -1683,6 +1924,347 @@ dasd_path_fcs_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
static struct kobj_attribute path_fcs_attribute =
__ATTR(fc_security, 0444, dasd_path_fcs_show, NULL);
/*
* print copy relation in the form
* primary,secondary[1] primary,secondary[2], ...
*/
static ssize_t
dasd_copy_pair_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
char prim_busid[DASD_BUS_ID_SIZE];
struct dasd_copy_relation *copy;
struct dasd_devmap *devmap;
int len = 0;
int i;
devmap = dasd_find_busid(dev_name(dev));
if (IS_ERR(devmap))
return -ENODEV;
if (!devmap->copy)
return -ENODEV;
copy = devmap->copy;
/* find primary */
for (i = 0; i < DASD_CP_ENTRIES; i++) {
if (copy->entry[i].configured && copy->entry[i].primary) {
strscpy(prim_busid, copy->entry[i].busid,
DASD_BUS_ID_SIZE);
break;
}
}
if (!copy->entry[i].primary)
goto out;
/* print all secondary */
for (i = 0; i < DASD_CP_ENTRIES; i++) {
if (copy->entry[i].configured && !copy->entry[i].primary)
len += sysfs_emit_at(buf, len, "%s,%s ", prim_busid,
copy->entry[i].busid);
}
len += sysfs_emit_at(buf, len, "\n");
out:
return len;
}
static int dasd_devmap_set_copy_relation(struct dasd_devmap *devmap,
struct dasd_copy_relation *copy,
char *busid, bool primary)
{
int i;
/* find free entry */
for (i = 0; i < DASD_CP_ENTRIES; i++) {
/* current bus_id already included, nothing to do */
if (copy->entry[i].configured &&
strncmp(copy->entry[i].busid, busid, DASD_BUS_ID_SIZE) == 0)
return 0;
if (!copy->entry[i].configured)
break;
}
if (i == DASD_CP_ENTRIES)
return -EINVAL;
copy->entry[i].configured = true;
strscpy(copy->entry[i].busid, busid, DASD_BUS_ID_SIZE);
if (primary) {
copy->active = &copy->entry[i];
copy->entry[i].primary = true;
}
if (!devmap->copy)
devmap->copy = copy;
return 0;
}
static void dasd_devmap_del_copy_relation(struct dasd_copy_relation *copy,
char *busid)
{
int i;
spin_lock(&dasd_devmap_lock);
/* find entry */
for (i = 0; i < DASD_CP_ENTRIES; i++) {
if (copy->entry[i].configured &&
strncmp(copy->entry[i].busid, busid, DASD_BUS_ID_SIZE) == 0)
break;
}
if (i == DASD_CP_ENTRIES || !copy->entry[i].configured) {
spin_unlock(&dasd_devmap_lock);
return;
}
copy->entry[i].configured = false;
memset(copy->entry[i].busid, 0, DASD_BUS_ID_SIZE);
if (copy->active == &copy->entry[i]) {
copy->active = NULL;
copy->entry[i].primary = false;
}
spin_unlock(&dasd_devmap_lock);
}
static int dasd_devmap_clear_copy_relation(struct device *dev)
{
struct dasd_copy_relation *copy;
struct dasd_devmap *devmap;
int i, rc = 1;
devmap = dasd_devmap_from_cdev(to_ccwdev(dev));
if (IS_ERR(devmap))
return 1;
spin_lock(&dasd_devmap_lock);
if (!devmap->copy)
goto out;
copy = devmap->copy;
/* first check if all secondary devices are offline */
for (i = 0; i < DASD_CP_ENTRIES; i++) {
if (!copy->entry[i].configured)
continue;
if (copy->entry[i].device == copy->active->device)
continue;
if (copy->entry[i].device)
goto out;
}
/* clear all devmap entries */
for (i = 0; i < DASD_CP_ENTRIES; i++) {
if (strlen(copy->entry[i].busid) == 0)
continue;
if (copy->entry[i].device) {
dasd_put_device(copy->entry[i].device);
copy->entry[i].device->copy = NULL;
copy->entry[i].device = NULL;
}
devmap = dasd_find_busid_locked(copy->entry[i].busid);
devmap->copy = NULL;
memset(copy->entry[i].busid, 0, DASD_BUS_ID_SIZE);
}
kfree(copy);
rc = 0;
out:
spin_unlock(&dasd_devmap_lock);
return rc;
}
/*
* parse BUSIDs from a copy pair
*/
static int dasd_devmap_parse_busid(const char *buf, char *prim_busid,
char *sec_busid)
{
char *primary, *secondary, *tmp, *pt;
int id0, id1, id2;
pt = kstrdup(buf, GFP_KERNEL);
tmp = pt;
if (!tmp)
return -ENOMEM;
primary = strsep(&tmp, ",");
if (!primary) {
kfree(pt);
return -EINVAL;
}
secondary = strsep(&tmp, ",");
if (!secondary) {
kfree(pt);
return -EINVAL;
}
if (dasd_busid(primary, &id0, &id1, &id2)) {
kfree(pt);
return -EINVAL;
}
sprintf(prim_busid, "%01x.%01x.%04x", id0, id1, id2);
if (dasd_busid(secondary, &id0, &id1, &id2)) {
kfree(pt);
return -EINVAL;
}
sprintf(sec_busid, "%01x.%01x.%04x", id0, id1, id2);
kfree(pt);
return 0;
}
static ssize_t dasd_copy_pair_store(struct device *dev,
struct device_attribute *attr,
const char *buf, size_t count)
{
struct dasd_devmap *prim_devmap, *sec_devmap;
char prim_busid[DASD_BUS_ID_SIZE];
char sec_busid[DASD_BUS_ID_SIZE];
struct dasd_copy_relation *copy;
struct dasd_device *device;
bool pprc_enabled;
int rc;
if (strncmp(buf, "clear", strlen("clear")) == 0) {
if (dasd_devmap_clear_copy_relation(dev))
return -EINVAL;
return count;
}
rc = dasd_devmap_parse_busid(buf, prim_busid, sec_busid);
if (rc)
return rc;
if (strncmp(dev_name(dev), prim_busid, DASD_BUS_ID_SIZE) != 0 &&
strncmp(dev_name(dev), sec_busid, DASD_BUS_ID_SIZE) != 0)
return -EINVAL;
/* allocate primary devmap if needed */
prim_devmap = dasd_find_busid(prim_busid);
if (IS_ERR(prim_devmap))
prim_devmap = dasd_add_busid(prim_busid, DASD_FEATURE_DEFAULT);
/* allocate secondary devmap if needed */
sec_devmap = dasd_find_busid(sec_busid);
if (IS_ERR(sec_devmap))
sec_devmap = dasd_add_busid(sec_busid, DASD_FEATURE_DEFAULT);
/* setting copy relation is only allowed for offline secondary */
if (sec_devmap->device)
return -EINVAL;
if (prim_devmap->copy) {
copy = prim_devmap->copy;
} else if (sec_devmap->copy) {
copy = sec_devmap->copy;
} else {
copy = kzalloc(sizeof(*copy), GFP_KERNEL);
if (!copy)
return -ENOMEM;
}
spin_lock(&dasd_devmap_lock);
rc = dasd_devmap_set_copy_relation(prim_devmap, copy, prim_busid, true);
if (rc) {
spin_unlock(&dasd_devmap_lock);
return rc;
}
rc = dasd_devmap_set_copy_relation(sec_devmap, copy, sec_busid, false);
if (rc) {
spin_unlock(&dasd_devmap_lock);
return rc;
}
spin_unlock(&dasd_devmap_lock);
/* if primary device is already online call device setup directly */
if (prim_devmap->device && !prim_devmap->device->copy) {
device = prim_devmap->device;
if (device->discipline->pprc_enabled) {
pprc_enabled = device->discipline->pprc_enabled(device);
rc = dasd_devmap_set_device_copy_relation(device->cdev,
pprc_enabled);
} else {
rc = -EOPNOTSUPP;
}
}
if (rc) {
dasd_devmap_del_copy_relation(copy, prim_busid);
dasd_devmap_del_copy_relation(copy, sec_busid);
count = rc;
}
return count;
}
static DEVICE_ATTR(copy_pair, 0644, dasd_copy_pair_show,
dasd_copy_pair_store);
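
Editor's illustration (not part of this patch): a minimal userspace sketch of driving the new copy_pair attribute. The bus IDs 0.0.1000/0.0.2000 and the sysfs path are assumed example values; the attribute accepts a "primary,secondary" pair and the keyword "clear", as implemented by dasd_copy_pair_show()/dasd_copy_pair_store() above.

/* Illustrative only: pair two hypothetical DASDs via sysfs and read the
 * relation back.  Needs root; bus IDs and the path are examples.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const char *attr = "/sys/bus/ccw/devices/0.0.1000/copy_pair";
	const char *pair = "0.0.1000,0.0.2000";	/* primary,secondary */
	char buf[128];
	ssize_t n;
	int fd;

	fd = open(attr, O_RDWR);
	if (fd < 0) {
		perror("open copy_pair");
		return 1;
	}
	/* establish the copy relation */
	if (write(fd, pair, strlen(pair)) < 0)
		perror("write copy_pair");
	/* read it back; prints e.g. "0.0.1000,0.0.2000 " */
	n = pread(fd, buf, sizeof(buf) - 1, 0);
	if (n > 0) {
		buf[n] = '\0';
		printf("copy_pair: %s", buf);
	}
	/* writing "clear" to the same attribute removes the relation
	 * again, provided all secondary devices are offline */
	close(fd);
	return 0;
}
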
static ssize_t
dasd_copy_role_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct dasd_copy_relation *copy;
struct dasd_device *device;
int len, i;
device = dasd_device_from_cdev(to_ccwdev(dev));
if (IS_ERR(device))
return -ENODEV;
if (!device->copy) {
len = sysfs_emit(buf, "none\n");
goto out;
}
copy = device->copy;
/* only the active device is primary */
if (copy->active->device == device) {
len = sysfs_emit(buf, "primary\n");
goto out;
}
for (i = 0; i < DASD_CP_ENTRIES; i++) {
if (copy->entry[i].device == device) {
len = sysfs_emit(buf, "secondary\n");
goto out;
}
}
/* not in the list, no COPY role */
len = sysfs_emit(buf, "none\n");
out:
dasd_put_device(device);
return len;
}
static DEVICE_ATTR(copy_role, 0444, dasd_copy_role_show, NULL);
static ssize_t dasd_device_ping(struct device *dev,
struct device_attribute *attr,
const char *buf, size_t count)
{
struct dasd_device *device;
size_t rc;
device = dasd_device_from_cdev(to_ccwdev(dev));
if (IS_ERR(device))
return -ENODEV;
/*
* do not try during offline processing
* early check only
* the sleep_on function itself checks for offline
* processing again
*/
if (test_bit(DASD_FLAG_OFFLINE, &device->flags)) {
rc = -EBUSY;
goto out;
}
if (!device->discipline || !device->discipline->device_ping) {
rc = -EOPNOTSUPP;
goto out;
}
rc = device->discipline->device_ping(device);
if (!rc)
rc = count;
out:
dasd_put_device(device);
return rc;
}
static DEVICE_ATTR(ping, 0200, NULL, dasd_device_ping);
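
Another editor's illustration (not part of this patch): reading copy_role and triggering a ping from userspace. The bus ID 0.0.1000 and the sysfs paths are assumptions.

/* Illustrative only: query the copy role and ping a hypothetical DASD.
 * The write-only ping attribute needs root.
 */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char role[32];
	ssize_t n;
	int fd;

	fd = open("/sys/bus/ccw/devices/0.0.1000/copy_role", O_RDONLY);
	if (fd >= 0) {
		n = read(fd, role, sizeof(role) - 1);
		if (n > 0) {
			role[n] = '\0';
			/* prints "primary", "secondary" or "none" */
			printf("copy role: %s", role);
		}
		close(fd);
	}
	fd = open("/sys/bus/ccw/devices/0.0.1000/ping", O_WRONLY);
	if (fd >= 0) {
		/* a successful write means the device answered the NOP */
		if (write(fd, "1\n", 2) < 0)
			perror("ping");
		close(fd);
	}
	return 0;
}
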
#define DASD_DEFINE_ATTR(_name, _func) \
static ssize_t dasd_##_name##_show(struct device *dev, \
struct device_attribute *attr, \
@@ -1739,6 +2321,9 @@ static struct attribute * dasd_attrs[] = {
&dev_attr_hpf.attr,
&dev_attr_ese.attr,
&dev_attr_fc_security.attr,
&dev_attr_copy_pair.attr,
&dev_attr_copy_role.attr,
&dev_attr_ping.attr,
NULL,
};


@@ -627,7 +627,7 @@ dasd_diag_dump_sense(struct dasd_device *device, struct dasd_ccw_req * req,
static void dasd_diag_setup_blk_queue(struct dasd_block *block)
{
unsigned int logical_block_size = block->bp_block;
-struct request_queue *q = block->request_queue;
+struct request_queue *q = block->gdp->queue;
int max;
max = DIAG_MAX_BLOCKS << block->s2b_shift;


@@ -2012,6 +2012,49 @@ static void dasd_eckd_kick_validate_server(struct dasd_device *device)
dasd_put_device(device);
}
/*
* return 1 if the device is the primary of an active copy relation, or if
* no copy relation is configured; return 0 otherwise
*/
static int dasd_device_is_primary(struct dasd_device *device)
{
if (!device->copy)
return 1;
if (device->copy->active->device == device)
return 1;
return 0;
}
static int dasd_eckd_alloc_block(struct dasd_device *device)
{
struct dasd_block *block;
struct dasd_uid temp_uid;
if (!dasd_device_is_primary(device))
return 0;
dasd_eckd_get_uid(device, &temp_uid);
if (temp_uid.type == UA_BASE_DEVICE) {
block = dasd_alloc_block();
if (IS_ERR(block)) {
DBF_EVENT_DEVID(DBF_WARNING, device->cdev, "%s",
"could not allocate dasd block structure");
return PTR_ERR(block);
}
device->block = block;
block->base = device;
}
return 0;
}
static bool dasd_eckd_pprc_enabled(struct dasd_device *device)
{
struct dasd_eckd_private *private = device->private;
return private->rdc_data.facilities.PPRC_enabled;
}
/*
 * Check device characteristics.
 * If the device is accessible using ECKD discipline, the device is enabled.
@@ -2020,8 +2063,6 @@ static int
dasd_eckd_check_characteristics(struct dasd_device *device)
{
struct dasd_eckd_private *private = device->private;
-struct dasd_block *block;
-struct dasd_uid temp_uid;
int rc, i;
int readonly;
unsigned long value;
@@ -2079,20 +2120,29 @@ dasd_eckd_check_characteristics(struct dasd_device *device)
device->default_expires = value;
}
-dasd_eckd_get_uid(device, &temp_uid);
-if (temp_uid.type == UA_BASE_DEVICE) {
-block = dasd_alloc_block();
-if (IS_ERR(block)) {
-DBF_EVENT_DEVID(DBF_WARNING, device->cdev, "%s",
-"could not allocate dasd "
-"block structure");
-rc = PTR_ERR(block);
-goto out_err1;
-}
-device->block = block;
-block->base = device;
-}
+/* Read Device Characteristics */
+rc = dasd_generic_read_dev_chars(device, DASD_ECKD_MAGIC,
+&private->rdc_data, 64);
+if (rc) {
+DBF_EVENT_DEVID(DBF_WARNING, device->cdev,
+"Read device characteristic failed, rc=%d", rc);
+goto out_err1;
+}
+/* setup PPRC for device from devmap */
+rc = dasd_devmap_set_device_copy_relation(device->cdev,
+dasd_eckd_pprc_enabled(device));
+if (rc) {
+DBF_EVENT_DEVID(DBF_WARNING, device->cdev,
+"copy relation setup failed, rc=%d", rc);
+goto out_err1;
+}
+/* check if block device is needed and allocate in case */
+rc = dasd_eckd_alloc_block(device);
+if (rc)
+goto out_err1;
/* register lcu with alias handling, enable PAV */
rc = dasd_alias_make_device_known_to_lcu(device);
if (rc)
@@ -2117,15 +2167,6 @@ dasd_eckd_check_characteristics(struct dasd_device *device)
/* Read Extent Pool Information */
dasd_eckd_read_ext_pool_info(device);
-/* Read Device Characteristics */
-rc = dasd_generic_read_dev_chars(device, DASD_ECKD_MAGIC,
-&private->rdc_data, 64);
-if (rc) {
-DBF_EVENT_DEVID(DBF_WARNING, device->cdev,
-"Read device characteristic failed, rc=%d", rc);
-goto out_err3;
-}
if ((device->features & DASD_FEATURE_USERAW) &&
!(private->rdc_data.facilities.RT_in_LR)) {
dev_err(&device->cdev->dev, "The storage server does not "
@@ -6078,6 +6119,207 @@ static int dasd_hosts_print(struct dasd_device *device, struct seq_file *m)
return 0;
}
static struct dasd_device
*copy_relation_find_device(struct dasd_copy_relation *copy,
char *busid)
{
int i;
for (i = 0; i < DASD_CP_ENTRIES; i++) {
if (copy->entry[i].configured &&
strncmp(copy->entry[i].busid, busid, DASD_BUS_ID_SIZE) == 0)
return copy->entry[i].device;
}
return NULL;
}
/*
* set the new active/primary device
*/
static void copy_pair_set_active(struct dasd_copy_relation *copy, char *new_busid,
char *old_busid)
{
int i;
for (i = 0; i < DASD_CP_ENTRIES; i++) {
if (copy->entry[i].configured &&
strncmp(copy->entry[i].busid, new_busid,
DASD_BUS_ID_SIZE) == 0) {
copy->active = &copy->entry[i];
copy->entry[i].primary = true;
} else if (copy->entry[i].configured &&
strncmp(copy->entry[i].busid, old_busid,
DASD_BUS_ID_SIZE) == 0) {
copy->entry[i].primary = false;
}
}
}
/*
* The function will swap the role of a given copy pair.
* During the swap operation the relation of the blockdevice is disconnected
* from the old primary and connected to the new.
*
* IO is paused on the block queue before swap and may be resumed afterwards.
*/
static int dasd_eckd_copy_pair_swap(struct dasd_device *device, char *prim_busid,
char *sec_busid)
{
struct dasd_device *primary, *secondary;
struct dasd_copy_relation *copy;
struct dasd_block *block;
struct gendisk *gdp;
copy = device->copy;
if (!copy)
return DASD_COPYPAIRSWAP_INVALID;
primary = copy->active->device;
if (!primary)
return DASD_COPYPAIRSWAP_INVALID;
/* double check if swap has correct primary */
if (strncmp(dev_name(&primary->cdev->dev), prim_busid, DASD_BUS_ID_SIZE) != 0)
return DASD_COPYPAIRSWAP_PRIMARY;
secondary = copy_relation_find_device(copy, sec_busid);
if (!secondary)
return DASD_COPYPAIRSWAP_SECONDARY;
/*
* usually the device should be quiesced for the swap;
* for paranoia, stop the device and requeue requests again
*/
dasd_device_set_stop_bits(primary, DASD_STOPPED_PPRC);
dasd_device_set_stop_bits(secondary, DASD_STOPPED_PPRC);
dasd_generic_requeue_all_requests(primary);
/* swap DASD internal device <> block assignment */
block = primary->block;
primary->block = NULL;
secondary->block = block;
block->base = secondary;
/* set new primary device in COPY relation */
copy_pair_set_active(copy, sec_busid, prim_busid);
/* swap blocklayer device link */
gdp = block->gdp;
dasd_add_link_to_gendisk(gdp, secondary);
/* re-enable device */
dasd_device_remove_stop_bits(primary, DASD_STOPPED_PPRC);
dasd_device_remove_stop_bits(secondary, DASD_STOPPED_PPRC);
dasd_schedule_device_bh(secondary);
return DASD_COPYPAIRSWAP_SUCCESS;
}
/*
* Perform Subsystem Function - Peer-to-Peer Remote Copy Extended Query
*/
static int dasd_eckd_query_pprc_status(struct dasd_device *device,
struct dasd_pprc_data_sc4 *data)
{
struct dasd_pprc_data_sc4 *pprc_data;
struct dasd_psf_prssd_data *prssdp;
struct dasd_ccw_req *cqr;
struct ccw1 *ccw;
int rc;
cqr = dasd_smalloc_request(DASD_ECKD_MAGIC, 1 /* PSF */ + 1 /* RSSD */,
sizeof(*prssdp) + sizeof(*pprc_data) + 1,
device, NULL);
if (IS_ERR(cqr)) {
DBF_EVENT_DEVID(DBF_WARNING, device->cdev, "%s",
"Could not allocate query PPRC status request");
return PTR_ERR(cqr);
}
cqr->startdev = device;
cqr->memdev = device;
cqr->block = NULL;
cqr->retries = 256;
cqr->expires = 10 * HZ;
/* Prepare for Read Subsystem Data */
prssdp = (struct dasd_psf_prssd_data *)cqr->data;
memset(prssdp, 0, sizeof(struct dasd_psf_prssd_data));
prssdp->order = PSF_ORDER_PRSSD;
prssdp->suborder = PSF_SUBORDER_PPRCEQ;
prssdp->varies[0] = PPRCEQ_SCOPE_4;
pprc_data = (struct dasd_pprc_data_sc4 *)(prssdp + 1);
ccw = cqr->cpaddr;
ccw->cmd_code = DASD_ECKD_CCW_PSF;
ccw->count = sizeof(struct dasd_psf_prssd_data);
ccw->flags |= CCW_FLAG_CC;
ccw->flags |= CCW_FLAG_SLI;
ccw->cda = (__u32)(addr_t)prssdp;
/* Read Subsystem Data - PPRC Extended Query */
ccw++;
ccw->cmd_code = DASD_ECKD_CCW_RSSD;
ccw->count = sizeof(*pprc_data);
ccw->flags |= CCW_FLAG_SLI;
ccw->cda = (__u32)(addr_t)pprc_data;
cqr->buildclk = get_tod_clock();
cqr->status = DASD_CQR_FILLED;
rc = dasd_sleep_on_interruptible(cqr);
if (rc == 0) {
*data = *pprc_data;
} else {
DBF_EVENT_DEVID(DBF_WARNING, device->cdev,
"PPRC Extended Query failed with rc=%d\n",
rc);
rc = -EOPNOTSUPP;
}
dasd_sfree_request(cqr, cqr->memdev);
return rc;
}
/*
* ECKD NOP - no operation
*/
static int dasd_eckd_nop(struct dasd_device *device)
{
struct dasd_ccw_req *cqr;
struct ccw1 *ccw;
int rc;
cqr = dasd_smalloc_request(DASD_ECKD_MAGIC, 1, 1, device, NULL);
if (IS_ERR(cqr)) {
DBF_EVENT_DEVID(DBF_WARNING, device->cdev, "%s",
"Could not allocate NOP request");
return PTR_ERR(cqr);
}
cqr->startdev = device;
cqr->memdev = device;
cqr->block = NULL;
cqr->retries = 1;
cqr->expires = 10 * HZ;
ccw = cqr->cpaddr;
ccw->cmd_code = DASD_ECKD_CCW_NOP;
ccw->flags |= CCW_FLAG_SLI;
cqr->buildclk = get_tod_clock();
cqr->status = DASD_CQR_FILLED;
rc = dasd_sleep_on_interruptible(cqr);
if (rc != 0) {
DBF_EVENT_DEVID(DBF_WARNING, device->cdev,
"NOP failed with rc=%d\n", rc);
rc = -EOPNOTSUPP;
}
dasd_sfree_request(cqr, cqr->memdev);
return rc;
}
static int dasd_eckd_device_ping(struct dasd_device *device)
{
return dasd_eckd_nop(device);
}
/*
 * Perform Subsystem Function - CUIR response
 */
@@ -6602,7 +6844,7 @@ static void dasd_eckd_handle_hpf_error(struct dasd_device *device,
static void dasd_eckd_setup_blk_queue(struct dasd_block *block)
{
unsigned int logical_block_size = block->bp_block;
-struct request_queue *q = block->request_queue;
+struct request_queue *q = block->gdp->queue;
struct dasd_device *device = block->base;
int max;
@@ -6697,6 +6939,10 @@ static struct dasd_discipline dasd_eckd_discipline = {
.ext_pool_exhaust = dasd_eckd_ext_pool_exhaust,
.ese_format = dasd_eckd_ese_format,
.ese_read = dasd_eckd_ese_read,
.pprc_status = dasd_eckd_query_pprc_status,
.pprc_enabled = dasd_eckd_pprc_enabled,
.copy_pair_swap = dasd_eckd_copy_pair_swap,
.device_ping = dasd_eckd_device_ping,
};
static int __init


@@ -13,6 +13,7 @@
/*****************************************************************************
 * SECTION: CCW Definitions
 ****************************************************************************/
#define DASD_ECKD_CCW_NOP 0x03
#define DASD_ECKD_CCW_WRITE 0x05
#define DASD_ECKD_CCW_READ 0x06
#define DASD_ECKD_CCW_WRITE_HOME_ADDRESS 0x09
@@ -66,9 +67,15 @@
* Perform Subsystem Function / Sub-Orders
*/
#define PSF_SUBORDER_QHA 0x1C /* Query Host Access */
#define PSF_SUBORDER_PPRCEQ 0x50 /* PPRC Extended Query */
#define PSF_SUBORDER_VSQ 0x52 /* Volume Storage Query */
#define PSF_SUBORDER_LCQ 0x53 /* Logical Configuration Query */
/*
* PPRC Extended Query Scopes
*/
#define PPRCEQ_SCOPE_4 0x04 /* Scope 4 for PPRC Extended Query */
/*
 * CUIR response condition codes
 */
@@ -261,7 +268,7 @@ struct dasd_eckd_characteristics {
unsigned char reserved3:8;
unsigned char defect_wr:1;
unsigned char XRC_supported:1;
-unsigned char reserved4:1;
+unsigned char PPRC_enabled:1;
unsigned char striping:1;
unsigned char reserved5:4;
unsigned char cfw:1;


@@ -767,7 +767,7 @@ dasd_fba_dump_sense(struct dasd_device *device, struct dasd_ccw_req * req,
static void dasd_fba_setup_blk_queue(struct dasd_block *block)
{
unsigned int logical_block_size = block->bp_block;
-struct request_queue *q = block->request_queue;
+struct request_queue *q = block->gdp->queue;
unsigned int max_bytes, max_discard_sectors;
int max;


@@ -25,7 +25,14 @@
#include "dasd_int.h" #include "dasd_int.h"
static struct lock_class_key dasd_bio_compl_lkclass; static unsigned int queue_depth = 32;
static unsigned int nr_hw_queues = 4;
module_param(queue_depth, uint, 0444);
MODULE_PARM_DESC(queue_depth, "Default queue depth for new DASD devices");
module_param(nr_hw_queues, uint, 0444);
MODULE_PARM_DESC(nr_hw_queues, "Default number of hardware queues for new DASD devices");
/*
 * Allocate and register gendisk structure for device.
@@ -41,10 +48,21 @@ int dasd_gendisk_alloc(struct dasd_block *block)
if (base->devindex >= DASD_PER_MAJOR)
return -EBUSY;
-gdp = blk_mq_alloc_disk_for_queue(block->request_queue,
-&dasd_bio_compl_lkclass);
-if (!gdp)
-return -ENOMEM;
+block->tag_set.ops = &dasd_mq_ops;
+block->tag_set.cmd_size = sizeof(struct dasd_ccw_req);
+block->tag_set.nr_hw_queues = nr_hw_queues;
+block->tag_set.queue_depth = queue_depth;
+block->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
+block->tag_set.numa_node = NUMA_NO_NODE;
+rc = blk_mq_alloc_tag_set(&block->tag_set);
+if (rc)
+return rc;
+gdp = blk_mq_alloc_disk(&block->tag_set, block);
+if (IS_ERR(gdp)) {
+blk_mq_free_tag_set(&block->tag_set);
+return PTR_ERR(gdp);
+}
/* Initialize gendisk structure. */
gdp->major = DASD_MAJOR;
@@ -100,6 +118,7 @@ void dasd_gendisk_free(struct dasd_block *block)
block->gdp->private_data = NULL;
put_disk(block->gdp);
block->gdp = NULL;
blk_mq_free_tag_set(&block->tag_set);
}
}


@@ -259,6 +259,55 @@ struct dasd_uid {
char vduit[33];
};
/*
* PPRC Status data
*/
struct dasd_pprc_header {
__u8 entries; /* 0 Number of device entries */
__u8 unused; /* 1 unused */
__u16 entry_length; /* 2-3 Length of device entry */
__u32 unused2; /* 4-7 unused */
} __packed;
struct dasd_pprc_dev_info {
__u8 state; /* 0 Copy State */
__u8 flags; /* 1 Flags */
__u8 reserved1[2]; /* 2-3 reserved */
__u8 prim_lss; /* 4 Primary device LSS */
__u8 primary; /* 5 Primary device address */
__u8 sec_lss; /* 6 Secondary device LSS */
__u8 secondary; /* 7 Secondary device address */
__u16 pprc_id; /* 8-9 Peer-to-Peer Remote Copy ID */
__u8 reserved2[12]; /* 10-21 reserved */
__u16 prim_cu_ssid; /* 22-23 Primary Control Unit SSID */
__u8 reserved3[12]; /* 24-35 reserved */
__u16 sec_cu_ssid; /* 36-37 Secondary Control Unit SSID */
__u8 reserved4[90]; /* 38-127 reserved */
} __packed;
struct dasd_pprc_data_sc4 {
struct dasd_pprc_header header;
struct dasd_pprc_dev_info dev_info[5];
} __packed;
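
Editor's sketch for orientation only — this is not the dasd_devmap_entry_from_pprc_data() helper referenced earlier, just an illustration of how a lookup over the returned dev_info[] entries could match a device by control unit SSID and real unit address, mirroring the checks in dasd_devmap_check_copy_relation():

/* Illustrative sketch only: return the dev_info[] index that describes
 * the given UID on the primary or secondary side, or -1 if the device
 * is not part of the reported copy relation.
 */
static int pprc_find_entry_example(const struct dasd_pprc_data_sc4 *data,
				   const struct dasd_uid *uid, bool primary)
{
	int i, n;

	n = min_t(int, data->header.entries, ARRAY_SIZE(data->dev_info));
	for (i = 0; i < n; i++) {
		const struct dasd_pprc_dev_info *info = &data->dev_info[i];

		if (primary && info->prim_cu_ssid == uid->ssid &&
		    info->primary == uid->real_unit_addr)
			return i;
		if (!primary && info->sec_cu_ssid == uid->ssid &&
		    info->secondary == uid->real_unit_addr)
			return i;
	}
	return -1;
}
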
#define DASD_BUS_ID_SIZE 20
#define DASD_CP_ENTRIES 5
struct dasd_copy_entry {
char busid[DASD_BUS_ID_SIZE];
struct dasd_device *device;
bool primary;
bool configured;
};
struct dasd_copy_relation {
struct dasd_copy_entry entry[DASD_CP_ENTRIES];
struct dasd_copy_entry *active;
};
int dasd_devmap_set_device_copy_relation(struct ccw_device *,
bool pprc_enabled);
/*
 * the struct dasd_discipline is
 * sth like a table of virtual functions, if you think of dasd_eckd
@@ -387,6 +436,10 @@ struct dasd_discipline {
struct dasd_ccw_req *(*ese_format)(struct dasd_device *,
struct dasd_ccw_req *, struct irb *);
int (*ese_read)(struct dasd_ccw_req *, struct irb *);
int (*pprc_status)(struct dasd_device *, struct dasd_pprc_data_sc4 *);
bool (*pprc_enabled)(struct dasd_device *);
int (*copy_pair_swap)(struct dasd_device *, char *, char *);
int (*device_ping)(struct dasd_device *);
};
extern struct dasd_discipline *dasd_diag_discipline_pointer;
@@ -583,12 +636,12 @@ struct dasd_device {
struct dasd_profile profile;
struct dasd_format_entry format_entry;
struct kset *paths_info;
struct dasd_copy_relation *copy;
};
struct dasd_block {
/* Block device stuff. */
struct gendisk *gdp;
-struct request_queue *request_queue;
spinlock_t request_queue_lock;
struct blk_mq_tag_set tag_set;
struct block_device *bdev;
@@ -629,6 +682,7 @@ struct dasd_queue {
#define DASD_STOPPED_PENDING 4 /* long busy */
#define DASD_STOPPED_DC_WAIT 8 /* disconnected, wait */
#define DASD_STOPPED_SU 16 /* summary unit check handling */
#define DASD_STOPPED_PPRC 32 /* PPRC swap */
#define DASD_STOPPED_NOSPC 128 /* no space left */
/* per device flags */
@@ -653,6 +707,22 @@ struct dasd_queue {
void dasd_put_device_wake(struct dasd_device *);
/*
* return values to be returned from the copy pair swap function
* 0x00: swap successful
* 0x01: swap data invalid
* 0x02: no active device found
* 0x03: wrong primary specified
* 0x04: secondary device not found
* 0x05: swap already running
*/
#define DASD_COPYPAIRSWAP_SUCCESS 0
#define DASD_COPYPAIRSWAP_INVALID 1
#define DASD_COPYPAIRSWAP_NOACTIVE 2
#define DASD_COPYPAIRSWAP_PRIMARY 3
#define DASD_COPYPAIRSWAP_SECONDARY 4
#define DASD_COPYPAIRSWAP_MULTIPLE 5
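
Purely as an editor's illustration for tooling built around the swap interface (not part of the patch), the status codes above map naturally to messages:

/* Illustrative only: translate copy pair swap status codes to text. */
static const char *copypair_swap_strerror(int rc)
{
	switch (rc) {
	case DASD_COPYPAIRSWAP_SUCCESS:   return "swap successful";
	case DASD_COPYPAIRSWAP_INVALID:   return "swap data invalid";
	case DASD_COPYPAIRSWAP_NOACTIVE:  return "no active device found";
	case DASD_COPYPAIRSWAP_PRIMARY:   return "wrong primary specified";
	case DASD_COPYPAIRSWAP_SECONDARY: return "secondary device not found";
	case DASD_COPYPAIRSWAP_MULTIPLE:  return "swap already running";
	default:                          return "unknown status";
	}
}
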
/*
 * Reference count inliners
 */
@@ -779,6 +849,7 @@ extern debug_info_t *dasd_debug_area;
extern struct dasd_profile dasd_global_profile;
extern unsigned int dasd_global_profile_level;
extern const struct block_device_operations dasd_device_operations;
extern struct blk_mq_ops dasd_mq_ops;
extern struct kmem_cache *dasd_page_cache;
@@ -837,6 +908,8 @@ int dasd_generic_verify_path(struct dasd_device *, __u8);
void dasd_generic_space_exhaust(struct dasd_device *, struct dasd_ccw_req *);
void dasd_generic_space_avail(struct dasd_device *);
int dasd_generic_requeue_all_requests(struct dasd_device *);
int dasd_generic_read_dev_chars(struct dasd_device *, int, void *, int);
char *dasd_get_sense(struct irb *);


@@ -379,6 +379,56 @@ out_err:
return rc;
}
/*
* Swap driver internal copy relation.
*/
static int
dasd_ioctl_copy_pair_swap(struct block_device *bdev, void __user *argp)
{
struct dasd_copypair_swap_data_t data;
struct dasd_device *device;
int rc;
if (!capable(CAP_SYS_ADMIN))
return -EACCES;
device = dasd_device_from_gendisk(bdev->bd_disk);
if (!device)
return -ENODEV;
if (copy_from_user(&data, argp, sizeof(struct dasd_copypair_swap_data_t))) {
dasd_put_device(device);
return -EFAULT;
}
if (memchr_inv(data.reserved, 0, sizeof(data.reserved))) {
pr_warn("%s: Ivalid swap data specified.\n",
dev_name(&device->cdev->dev));
dasd_put_device(device);
return DASD_COPYPAIRSWAP_INVALID;
}
if (bdev_is_partition(bdev)) {
pr_warn("%s: The specified DASD is a partition and cannot be swapped\n",
dev_name(&device->cdev->dev));
dasd_put_device(device);
return DASD_COPYPAIRSWAP_INVALID;
}
if (!device->copy) {
pr_warn("%s: The specified DASD has no copy pair set up\n",
dev_name(&device->cdev->dev));
dasd_put_device(device);
return -ENODEV;
}
if (!device->discipline->copy_pair_swap) {
dasd_put_device(device);
return -EOPNOTSUPP;
}
rc = device->discipline->copy_pair_swap(device, data.primary,
data.secondary);
dasd_put_device(device);
return rc;
}
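
For completeness, an editor's sketch of calling the new ioctl from userspace. The structure layout assumed here (two 20-byte bus ID strings plus a zero-filled reserved area) and the device node are illustrative assumptions; the authoritative definitions of struct dasd_copypair_swap_data_t and BIODASDCOPYPAIRSWAP are the ones exported in the dasd uapi header.

/* Illustrative only: request a copy pair swap on a whole DASD (not a
 * partition).  Bus IDs and the device node are examples; see
 * <asm/dasd.h> for the real structure and ioctl definitions.
 */
#include <asm/dasd.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>

int main(void)
{
	struct dasd_copypair_swap_data_t data;
	int fd, rc;

	memset(&data, 0, sizeof(data));		/* reserved bytes must be zero */
	strncpy(data.primary, "0.0.1000", sizeof(data.primary) - 1);
	strncpy(data.secondary, "0.0.2000", sizeof(data.secondary) - 1);

	fd = open("/dev/dasda", O_RDONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}
	rc = ioctl(fd, BIODASDCOPYPAIRSWAP, &data);
	if (rc > 0)
		/* one of the DASD_COPYPAIRSWAP_* status codes */
		fprintf(stderr, "swap rejected, status %d\n", rc);
	else if (rc < 0)
		perror("BIODASDCOPYPAIRSWAP");
	close(fd);
	return 0;
}
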
#ifdef CONFIG_DASD_PROFILE
/*
 * Reset device profile information
@@ -637,6 +687,9 @@ int dasd_ioctl(struct block_device *bdev, fmode_t mode,
case BIODASDRAS:
rc = dasd_ioctl_release_space(bdev, argp);
break;
case BIODASDCOPYPAIRSWAP:
rc = dasd_ioctl_copy_pair_swap(bdev, argp);
break;
default:
/* if the discipline has an ioctl method try it. */
rc = -ENOTTY;


@@ -3537,7 +3537,7 @@ static struct attribute *host_v2_hw_attrs[] = {
ATTRIBUTE_GROUPS(host_v2_hw);
-static int map_queues_v2_hw(struct Scsi_Host *shost)
+static void map_queues_v2_hw(struct Scsi_Host *shost)
{
struct hisi_hba *hisi_hba = shost_priv(shost);
struct blk_mq_queue_map *qmap = &shost->tag_set.map[HCTX_TYPE_DEFAULT];
@@ -3552,9 +3552,6 @@ static int map_queues_v2_hw(struct Scsi_Host *shost)
for_each_cpu(cpu, mask)
qmap->mq_map[cpu] = qmap->queue_offset + queue;
}
-return 0;
}
static struct scsi_host_template sht_v2_hw = {


@@ -3171,13 +3171,12 @@ static int debugfs_set_bist_v3_hw(struct hisi_hba *hisi_hba, bool enable)
return 0;
}
-static int hisi_sas_map_queues(struct Scsi_Host *shost)
+static void hisi_sas_map_queues(struct Scsi_Host *shost)
{
struct hisi_hba *hisi_hba = shost_priv(shost);
struct blk_mq_queue_map *qmap = &shost->tag_set.map[HCTX_TYPE_DEFAULT];
-return blk_mq_pci_map_queues(qmap, hisi_hba->pci_dev,
-BASE_VECTORS_V3_HW);
+blk_mq_pci_map_queues(qmap, hisi_hba->pci_dev, BASE_VECTORS_V3_HW);
}
static struct scsi_host_template sht_v3_hw = {

Some files were not shown because too many files have changed in this diff.