From c21f53f17430230dab50df29b8ea1b71f99d09d6 Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Tue, 7 Apr 2015 13:39:12 +0200 Subject: [PATCH 01/51] Add BFQ-v8r12 This commit is the result of the following operations. 1. The squash of all the commits between "block: cgroups, kconfig, build bits for BFQ-v7r11-4.5.0" and BFQ-v8r12 in the branch bfq-mq-v8-v4.11 2. The renaming of two files (block/bfq-cgroup.c -> block/bfq-cgroup-included.c and block/bfq-iosched.c -> block/bfq-sq-iosched.c) and of one option (CONFIG_BFQ_GROUP_IOSCHED -> CONFIG_BFQ_SQ_GROUP_IOSCHED), to avoid name clashes. These name clashes are due to the presence of bfq in mainline from 4.12. 3. The modification of block/Makefile and block/Kconfig.iosched to comply with the above renaming. Signed-off-by: Mauro Andreolini Signed-off-by: Arianna Avanzini Signed-off-by: Linus Walleij Signed-off-by: Paolo Valente --- Makefile | 2 +- block/Kconfig.iosched | 31 + block/bfq-cgroup-included.c | 1190 ++++++++++ block/bfq-ioc.c | 36 + block/bfq-sched.c | 2002 ++++++++++++++++ block/bfq-sq-iosched.c | 5379 +++++++++++++++++++++++++++++++++++++++++++ block/bfq.h | 948 ++++++++ include/linux/blkdev.h | 2 +- 9 files changed, 9589 insertions(+), 2 deletions(-) create mode 100644 block/bfq-cgroup-included.c create mode 100644 block/bfq-ioc.c create mode 100644 block/bfq-sched.c create mode 100644 block/bfq-sq-iosched.c create mode 100644 block/bfq.h diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched index a4a8914bf7a4..9e3f4c2f7390 100644 --- a/block/Kconfig.iosched +++ b/block/Kconfig.iosched @@ -40,6 +40,26 @@ config CFQ_GROUP_IOSCHED ---help--- Enable group IO scheduling in CFQ. +config IOSCHED_BFQ_SQ + tristate "BFQ-SQ I/O scheduler" + default n + ---help--- + The BFQ-SQ I/O scheduler (for legacy blk: SQ stands for + SingleQueue) distributes bandwidth among all processes + according to their weights, regardless of the device + parameters and with any workload. It also guarantees a low + latency to interactive and soft real-time applications. + Details in Documentation/block/bfq-iosched.txt + +config BFQ_SQ_GROUP_IOSCHED + bool "BFQ-SQ hierarchical scheduling support" + depends on IOSCHED_BFQ_SQ && BLK_CGROUP + default n + ---help--- + + Enable hierarchical scheduling in BFQ-SQ, using the blkio + (cgroups-v1) or io (cgroups-v2) controller. + choice prompt "Default I/O scheduler" @@ -54,6 +74,16 @@ choice config DEFAULT_CFQ bool "CFQ" if IOSCHED_CFQ=y + config DEFAULT_BFQ_SQ + bool "BFQ-SQ" if IOSCHED_BFQ_SQ=y + help + Selects BFQ-SQ as the default I/O scheduler which will be + used by default for all block devices. + The BFQ-SQ I/O scheduler aims at distributing the bandwidth + as desired, independently of the disk parameters and with + any workload. It also tries to guarantee low latency to + interactive and soft real-time applications. + config DEFAULT_NOOP bool "No-op" @@ -63,6 +93,7 @@ config DEFAULT_IOSCHED string default "deadline" if DEFAULT_DEADLINE default "cfq" if DEFAULT_CFQ + default "bfq-sq" if DEFAULT_BFQ_SQ default "noop" if DEFAULT_NOOP config MQ_IOSCHED_DEADLINE diff --git a/block/Makefile b/block/Makefile index 6a56303b9925..59026b425791 100644 --- a/block/Makefile +++ b/block/Makefile @@ -24,6 +24,7 @@ obj-$(CONFIG_MQ_IOSCHED_DEADLINE) += mq-deadline.o obj-$(CONFIG_MQ_IOSCHED_KYBER) += kyber-iosched.o bfq-y := bfq-iosched.o bfq-wf2q.o bfq-cgroup.o obj-$(CONFIG_IOSCHED_BFQ) += bfq.o +obj-$(CONFIG_IOSCHED_BFQ_SQ) += bfq-sq-iosched.o obj-$(CONFIG_BLOCK_COMPAT) += compat_ioctl.o obj-$(CONFIG_BLK_CMDLINE_PARSER) += cmdline-parser.o diff --git a/block/bfq-cgroup-included.c b/block/bfq-cgroup-included.c new file mode 100644 index 000000000000..af7c216a3540 --- /dev/null +++ b/block/bfq-cgroup-included.c @@ -0,0 +1,1190 @@ +/* + * BFQ: CGROUPS support. + * + * Based on ideas and code from CFQ: + * Copyright (C) 2003 Jens Axboe + * + * Copyright (C) 2008 Fabio Checconi + * Paolo Valente + * + * Copyright (C) 2015 Paolo Valente + * + * Copyright (C) 2016 Paolo Valente + * + * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ + * file. + */ + +#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED + +/* bfqg stats flags */ +enum bfqg_stats_flags { + BFQG_stats_waiting = 0, + BFQG_stats_idling, + BFQG_stats_empty, +}; + +#define BFQG_FLAG_FNS(name) \ +static void bfqg_stats_mark_##name(struct bfqg_stats *stats) \ +{ \ + stats->flags |= (1 << BFQG_stats_##name); \ +} \ +static void bfqg_stats_clear_##name(struct bfqg_stats *stats) \ +{ \ + stats->flags &= ~(1 << BFQG_stats_##name); \ +} \ +static int bfqg_stats_##name(struct bfqg_stats *stats) \ +{ \ + return (stats->flags & (1 << BFQG_stats_##name)) != 0; \ +} \ + +BFQG_FLAG_FNS(waiting) +BFQG_FLAG_FNS(idling) +BFQG_FLAG_FNS(empty) +#undef BFQG_FLAG_FNS + +/* This should be called with the queue_lock held. */ +static void bfqg_stats_update_group_wait_time(struct bfqg_stats *stats) +{ + unsigned long long now; + + if (!bfqg_stats_waiting(stats)) + return; + + now = sched_clock(); + if (time_after64(now, stats->start_group_wait_time)) + blkg_stat_add(&stats->group_wait_time, + now - stats->start_group_wait_time); + bfqg_stats_clear_waiting(stats); +} + +/* This should be called with the queue_lock held. */ +static void bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg, + struct bfq_group *curr_bfqg) +{ + struct bfqg_stats *stats = &bfqg->stats; + + if (bfqg_stats_waiting(stats)) + return; + if (bfqg == curr_bfqg) + return; + stats->start_group_wait_time = sched_clock(); + bfqg_stats_mark_waiting(stats); +} + +/* This should be called with the queue_lock held. */ +static void bfqg_stats_end_empty_time(struct bfqg_stats *stats) +{ + unsigned long long now; + + if (!bfqg_stats_empty(stats)) + return; + + now = sched_clock(); + if (time_after64(now, stats->start_empty_time)) + blkg_stat_add(&stats->empty_time, + now - stats->start_empty_time); + bfqg_stats_clear_empty(stats); +} + +static void bfqg_stats_update_dequeue(struct bfq_group *bfqg) +{ + blkg_stat_add(&bfqg->stats.dequeue, 1); +} + +static void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg) +{ + struct bfqg_stats *stats = &bfqg->stats; + + if (blkg_rwstat_total(&stats->queued)) + return; + + /* + * group is already marked empty. This can happen if bfqq got new + * request in parent group and moved to this group while being added + * to service tree. Just ignore the event and move on. + */ + if (bfqg_stats_empty(stats)) + return; + + stats->start_empty_time = sched_clock(); + bfqg_stats_mark_empty(stats); +} + +static void bfqg_stats_update_idle_time(struct bfq_group *bfqg) +{ + struct bfqg_stats *stats = &bfqg->stats; + + if (bfqg_stats_idling(stats)) { + unsigned long long now = sched_clock(); + + if (time_after64(now, stats->start_idle_time)) + blkg_stat_add(&stats->idle_time, + now - stats->start_idle_time); + bfqg_stats_clear_idling(stats); + } +} + +static void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg) +{ + struct bfqg_stats *stats = &bfqg->stats; + + stats->start_idle_time = sched_clock(); + bfqg_stats_mark_idling(stats); +} + +static void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg) +{ + struct bfqg_stats *stats = &bfqg->stats; + + blkg_stat_add(&stats->avg_queue_size_sum, + blkg_rwstat_total(&stats->queued)); + blkg_stat_add(&stats->avg_queue_size_samples, 1); + bfqg_stats_update_group_wait_time(stats); +} + +static struct blkcg_policy blkcg_policy_bfq; + +/* + * blk-cgroup policy-related handlers + * The following functions help in converting between blk-cgroup + * internal structures and BFQ-specific structures. + */ + +static struct bfq_group *pd_to_bfqg(struct blkg_policy_data *pd) +{ + return pd ? container_of(pd, struct bfq_group, pd) : NULL; +} + +static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg) +{ + return pd_to_blkg(&bfqg->pd); +} + +static struct bfq_group *blkg_to_bfqg(struct blkcg_gq *blkg) +{ + struct blkg_policy_data *pd = blkg_to_pd(blkg, &blkcg_policy_bfq); + + return pd_to_bfqg(pd); +} + +/* + * bfq_group handlers + * The following functions help in navigating the bfq_group hierarchy + * by allowing to find the parent of a bfq_group or the bfq_group + * associated to a bfq_queue. + */ + +static struct bfq_group *bfqg_parent(struct bfq_group *bfqg) +{ + struct blkcg_gq *pblkg = bfqg_to_blkg(bfqg)->parent; + + return pblkg ? blkg_to_bfqg(pblkg) : NULL; +} + +static struct bfq_group *bfqq_group(struct bfq_queue *bfqq) +{ + struct bfq_entity *group_entity = bfqq->entity.parent; + + return group_entity ? container_of(group_entity, struct bfq_group, + entity) : + bfqq->bfqd->root_group; +} + +/* + * The following two functions handle get and put of a bfq_group by + * wrapping the related blk-cgroup hooks. + */ + +static void bfqg_get(struct bfq_group *bfqg) +{ + return blkg_get(bfqg_to_blkg(bfqg)); +} + +static void bfqg_put(struct bfq_group *bfqg) +{ + return blkg_put(bfqg_to_blkg(bfqg)); +} + +static void bfqg_stats_update_io_add(struct bfq_group *bfqg, + struct bfq_queue *bfqq, + unsigned int op) +{ + blkg_rwstat_add(&bfqg->stats.queued, op, 1); + bfqg_stats_end_empty_time(&bfqg->stats); + if (!(bfqq == ((struct bfq_data *)bfqg->bfqd)->in_service_queue)) + bfqg_stats_set_start_group_wait_time(bfqg, bfqq_group(bfqq)); +} + +static void bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op) +{ + blkg_rwstat_add(&bfqg->stats.queued, op, -1); +} + +static void bfqg_stats_update_io_merged(struct bfq_group *bfqg, unsigned int op) +{ + blkg_rwstat_add(&bfqg->stats.merged, op, 1); +} + +static void bfqg_stats_update_completion(struct bfq_group *bfqg, + uint64_t start_time, uint64_t io_start_time, + unsigned int op) +{ + struct bfqg_stats *stats = &bfqg->stats; + unsigned long long now = sched_clock(); + + if (time_after64(now, io_start_time)) + blkg_rwstat_add(&stats->service_time, op, + now - io_start_time); + if (time_after64(io_start_time, start_time)) + blkg_rwstat_add(&stats->wait_time, op, + io_start_time - start_time); +} + +/* @stats = 0 */ +static void bfqg_stats_reset(struct bfqg_stats *stats) +{ + /* queued stats shouldn't be cleared */ + blkg_rwstat_reset(&stats->merged); + blkg_rwstat_reset(&stats->service_time); + blkg_rwstat_reset(&stats->wait_time); + blkg_stat_reset(&stats->time); + blkg_stat_reset(&stats->avg_queue_size_sum); + blkg_stat_reset(&stats->avg_queue_size_samples); + blkg_stat_reset(&stats->dequeue); + blkg_stat_reset(&stats->group_wait_time); + blkg_stat_reset(&stats->idle_time); + blkg_stat_reset(&stats->empty_time); +} + +/* @to += @from */ +static void bfqg_stats_add_aux(struct bfqg_stats *to, struct bfqg_stats *from) +{ + if (!to || !from) + return; + + /* queued stats shouldn't be cleared */ + blkg_rwstat_add_aux(&to->merged, &from->merged); + blkg_rwstat_add_aux(&to->service_time, &from->service_time); + blkg_rwstat_add_aux(&to->wait_time, &from->wait_time); + blkg_stat_add_aux(&from->time, &from->time); + blkg_stat_add_aux(&to->avg_queue_size_sum, &from->avg_queue_size_sum); + blkg_stat_add_aux(&to->avg_queue_size_samples, + &from->avg_queue_size_samples); + blkg_stat_add_aux(&to->dequeue, &from->dequeue); + blkg_stat_add_aux(&to->group_wait_time, &from->group_wait_time); + blkg_stat_add_aux(&to->idle_time, &from->idle_time); + blkg_stat_add_aux(&to->empty_time, &from->empty_time); +} + +/* + * Transfer @bfqg's stats to its parent's dead_stats so that the ancestors' + * recursive stats can still account for the amount used by this bfqg after + * it's gone. + */ +static void bfqg_stats_xfer_dead(struct bfq_group *bfqg) +{ + struct bfq_group *parent; + + if (!bfqg) /* root_group */ + return; + + parent = bfqg_parent(bfqg); + + lockdep_assert_held(bfqg_to_blkg(bfqg)->q->queue_lock); + + if (unlikely(!parent)) + return; + + bfqg_stats_add_aux(&parent->stats, &bfqg->stats); + bfqg_stats_reset(&bfqg->stats); +} + +static void bfq_init_entity(struct bfq_entity *entity, + struct bfq_group *bfqg) +{ + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); + + entity->weight = entity->new_weight; + entity->orig_weight = entity->new_weight; + if (bfqq) { + bfqq->ioprio = bfqq->new_ioprio; + bfqq->ioprio_class = bfqq->new_ioprio_class; + bfqg_get(bfqg); + } + entity->parent = bfqg->my_entity; /* NULL for root group */ + entity->sched_data = &bfqg->sched_data; +} + +static void bfqg_stats_exit(struct bfqg_stats *stats) +{ + blkg_rwstat_exit(&stats->merged); + blkg_rwstat_exit(&stats->service_time); + blkg_rwstat_exit(&stats->wait_time); + blkg_rwstat_exit(&stats->queued); + blkg_stat_exit(&stats->time); + blkg_stat_exit(&stats->avg_queue_size_sum); + blkg_stat_exit(&stats->avg_queue_size_samples); + blkg_stat_exit(&stats->dequeue); + blkg_stat_exit(&stats->group_wait_time); + blkg_stat_exit(&stats->idle_time); + blkg_stat_exit(&stats->empty_time); +} + +static int bfqg_stats_init(struct bfqg_stats *stats, gfp_t gfp) +{ + if (blkg_rwstat_init(&stats->merged, gfp) || + blkg_rwstat_init(&stats->service_time, gfp) || + blkg_rwstat_init(&stats->wait_time, gfp) || + blkg_rwstat_init(&stats->queued, gfp) || + blkg_stat_init(&stats->time, gfp) || + blkg_stat_init(&stats->avg_queue_size_sum, gfp) || + blkg_stat_init(&stats->avg_queue_size_samples, gfp) || + blkg_stat_init(&stats->dequeue, gfp) || + blkg_stat_init(&stats->group_wait_time, gfp) || + blkg_stat_init(&stats->idle_time, gfp) || + blkg_stat_init(&stats->empty_time, gfp)) { + bfqg_stats_exit(stats); + return -ENOMEM; + } + + return 0; +} + +static struct bfq_group_data *cpd_to_bfqgd(struct blkcg_policy_data *cpd) +{ + return cpd ? container_of(cpd, struct bfq_group_data, pd) : NULL; +} + +static struct bfq_group_data *blkcg_to_bfqgd(struct blkcg *blkcg) +{ + return cpd_to_bfqgd(blkcg_to_cpd(blkcg, &blkcg_policy_bfq)); +} + +static struct blkcg_policy_data *bfq_cpd_alloc(gfp_t gfp) +{ + struct bfq_group_data *bgd; + + bgd = kzalloc(sizeof(*bgd), gfp); + if (!bgd) + return NULL; + return &bgd->pd; +} + +static void bfq_cpd_init(struct blkcg_policy_data *cpd) +{ + struct bfq_group_data *d = cpd_to_bfqgd(cpd); + + d->weight = cgroup_subsys_on_dfl(io_cgrp_subsys) ? + CGROUP_WEIGHT_DFL : BFQ_WEIGHT_LEGACY_DFL; +} + +static void bfq_cpd_free(struct blkcg_policy_data *cpd) +{ + kfree(cpd_to_bfqgd(cpd)); +} + +static struct blkg_policy_data *bfq_pd_alloc(gfp_t gfp, int node) +{ + struct bfq_group *bfqg; + + bfqg = kzalloc_node(sizeof(*bfqg), gfp, node); + if (!bfqg) + return NULL; + + if (bfqg_stats_init(&bfqg->stats, gfp)) { + kfree(bfqg); + return NULL; + } + + return &bfqg->pd; +} + +static void bfq_pd_init(struct blkg_policy_data *pd) +{ + struct blkcg_gq *blkg; + struct bfq_group *bfqg; + struct bfq_data *bfqd; + struct bfq_entity *entity; + struct bfq_group_data *d; + + blkg = pd_to_blkg(pd); + BUG_ON(!blkg); + bfqg = blkg_to_bfqg(blkg); + bfqd = blkg->q->elevator->elevator_data; + entity = &bfqg->entity; + d = blkcg_to_bfqgd(blkg->blkcg); + + entity->orig_weight = entity->weight = entity->new_weight = d->weight; + entity->my_sched_data = &bfqg->sched_data; + bfqg->my_entity = entity; /* + * the root_group's will be set to NULL + * in bfq_init_queue() + */ + bfqg->bfqd = bfqd; + bfqg->active_entities = 0; + bfqg->rq_pos_tree = RB_ROOT; +} + +static void bfq_pd_free(struct blkg_policy_data *pd) +{ + struct bfq_group *bfqg = pd_to_bfqg(pd); + + bfqg_stats_exit(&bfqg->stats); + return kfree(bfqg); +} + +static void bfq_pd_reset_stats(struct blkg_policy_data *pd) +{ + struct bfq_group *bfqg = pd_to_bfqg(pd); + + bfqg_stats_reset(&bfqg->stats); +} + +static void bfq_group_set_parent(struct bfq_group *bfqg, + struct bfq_group *parent) +{ + struct bfq_entity *entity; + + BUG_ON(!parent); + BUG_ON(!bfqg); + BUG_ON(bfqg == parent); + + entity = &bfqg->entity; + entity->parent = parent->my_entity; + entity->sched_data = &parent->sched_data; +} + +static struct bfq_group *bfq_lookup_bfqg(struct bfq_data *bfqd, + struct blkcg *blkcg) +{ + struct blkcg_gq *blkg; + + blkg = blkg_lookup(blkcg, bfqd->queue); + if (likely(blkg)) + return blkg_to_bfqg(blkg); + return NULL; +} + +static struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd, + struct blkcg *blkcg) +{ + struct bfq_group *bfqg, *parent; + struct bfq_entity *entity; + + assert_spin_locked(bfqd->queue->queue_lock); + + bfqg = bfq_lookup_bfqg(bfqd, blkcg); + + if (unlikely(!bfqg)) + return NULL; + + /* + * Update chain of bfq_groups as we might be handling a leaf group + * which, along with some of its relatives, has not been hooked yet + * to the private hierarchy of BFQ. + */ + entity = &bfqg->entity; + for_each_entity(entity) { + bfqg = container_of(entity, struct bfq_group, entity); + BUG_ON(!bfqg); + if (bfqg != bfqd->root_group) { + parent = bfqg_parent(bfqg); + if (!parent) + parent = bfqd->root_group; + BUG_ON(!parent); + bfq_group_set_parent(bfqg, parent); + } + } + + return bfqg; +} + +static void bfq_pos_tree_add_move(struct bfq_data *bfqd, + struct bfq_queue *bfqq); + +static void bfq_bfqq_expire(struct bfq_data *bfqd, + struct bfq_queue *bfqq, + bool compensate, + enum bfqq_expiration reason); + +/** + * bfq_bfqq_move - migrate @bfqq to @bfqg. + * @bfqd: queue descriptor. + * @bfqq: the queue to move. + * @bfqg: the group to move to. + * + * Move @bfqq to @bfqg, deactivating it from its old group and reactivating + * it on the new one. Avoid putting the entity on the old group idle tree. + * + * Must be called under the queue lock; the cgroup owning @bfqg must + * not disappear (by now this just means that we are called under + * rcu_read_lock()). + */ +static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, + struct bfq_group *bfqg) +{ + struct bfq_entity *entity = &bfqq->entity; + + BUG_ON(!bfq_bfqq_busy(bfqq) && !RB_EMPTY_ROOT(&bfqq->sort_list)); + BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list) && !entity->on_st); + BUG_ON(bfq_bfqq_busy(bfqq) && RB_EMPTY_ROOT(&bfqq->sort_list) + && entity->on_st && + bfqq != bfqd->in_service_queue); + BUG_ON(!bfq_bfqq_busy(bfqq) && bfqq == bfqd->in_service_queue); + + /* If bfqq is empty, then bfq_bfqq_expire also invokes + * bfq_del_bfqq_busy, thereby removing bfqq and its entity + * from data structures related to current group. Otherwise we + * need to remove bfqq explicitly with bfq_deactivate_bfqq, as + * we do below. + */ + if (bfqq == bfqd->in_service_queue) + bfq_bfqq_expire(bfqd, bfqd->in_service_queue, + false, BFQ_BFQQ_PREEMPTED); + + BUG_ON(entity->on_st && !bfq_bfqq_busy(bfqq) + && &bfq_entity_service_tree(entity)->idle != + entity->tree); + + BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_bfqq_busy(bfqq)); + + if (bfq_bfqq_busy(bfqq)) + bfq_deactivate_bfqq(bfqd, bfqq, false, false); + else if (entity->on_st) { + BUG_ON(&bfq_entity_service_tree(entity)->idle != + entity->tree); + bfq_put_idle_entity(bfq_entity_service_tree(entity), entity); + } + bfqg_put(bfqq_group(bfqq)); + + /* + * Here we use a reference to bfqg. We don't need a refcounter + * as the cgroup reference will not be dropped, so that its + * destroy() callback will not be invoked. + */ + entity->parent = bfqg->my_entity; + entity->sched_data = &bfqg->sched_data; + bfqg_get(bfqg); + + BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_bfqq_busy(bfqq)); + if (bfq_bfqq_busy(bfqq)) { + bfq_pos_tree_add_move(bfqd, bfqq); + bfq_activate_bfqq(bfqd, bfqq); + } + + if (!bfqd->in_service_queue && !bfqd->rq_in_driver) + bfq_schedule_dispatch(bfqd); + BUG_ON(entity->on_st && !bfq_bfqq_busy(bfqq) + && &bfq_entity_service_tree(entity)->idle != + entity->tree); +} + +/** + * __bfq_bic_change_cgroup - move @bic to @cgroup. + * @bfqd: the queue descriptor. + * @bic: the bic to move. + * @blkcg: the blk-cgroup to move to. + * + * Move bic to blkcg, assuming that bfqd->queue is locked; the caller + * has to make sure that the reference to cgroup is valid across the call. + * + * NOTE: an alternative approach might have been to store the current + * cgroup in bfqq and getting a reference to it, reducing the lookup + * time here, at the price of slightly more complex code. + */ +static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd, + struct bfq_io_cq *bic, + struct blkcg *blkcg) +{ + struct bfq_queue *async_bfqq = bic_to_bfqq(bic, 0); + struct bfq_queue *sync_bfqq = bic_to_bfqq(bic, 1); + struct bfq_group *bfqg; + struct bfq_entity *entity; + + lockdep_assert_held(bfqd->queue->queue_lock); + + bfqg = bfq_find_set_group(bfqd, blkcg); + + if (unlikely(!bfqg)) + bfqg = bfqd->root_group; + + if (async_bfqq) { + entity = &async_bfqq->entity; + + if (entity->sched_data != &bfqg->sched_data) { + bic_set_bfqq(bic, NULL, 0); + bfq_log_bfqq(bfqd, async_bfqq, + "bic_change_group: %p %d", + async_bfqq, + async_bfqq->ref); + bfq_put_queue(async_bfqq); + } + } + + if (sync_bfqq) { + entity = &sync_bfqq->entity; + if (entity->sched_data != &bfqg->sched_data) + bfq_bfqq_move(bfqd, sync_bfqq, bfqg); + } + + return bfqg; +} + +static void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio) +{ + struct bfq_data *bfqd = bic_to_bfqd(bic); + struct bfq_group *bfqg = NULL; + uint64_t serial_nr; + + rcu_read_lock(); + serial_nr = bio_blkcg(bio)->css.serial_nr; + + /* + * Check whether blkcg has changed. The condition may trigger + * spuriously on a newly created cic but there's no harm. + */ + if (unlikely(!bfqd) || likely(bic->blkcg_serial_nr == serial_nr)) + goto out; + + bfqg = __bfq_bic_change_cgroup(bfqd, bic, bio_blkcg(bio)); + bic->blkcg_serial_nr = serial_nr; +out: + rcu_read_unlock(); +} + +/** + * bfq_flush_idle_tree - deactivate any entity on the idle tree of @st. + * @st: the service tree being flushed. + */ +static void bfq_flush_idle_tree(struct bfq_service_tree *st) +{ + struct bfq_entity *entity = st->first_idle; + + for (; entity ; entity = st->first_idle) + __bfq_deactivate_entity(entity, false); +} + +/** + * bfq_reparent_leaf_entity - move leaf entity to the root_group. + * @bfqd: the device data structure with the root group. + * @entity: the entity to move. + */ +static void bfq_reparent_leaf_entity(struct bfq_data *bfqd, + struct bfq_entity *entity) +{ + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); + + BUG_ON(!bfqq); + bfq_bfqq_move(bfqd, bfqq, bfqd->root_group); +} + +/** + * bfq_reparent_active_entities - move to the root group all active + * entities. + * @bfqd: the device data structure with the root group. + * @bfqg: the group to move from. + * @st: the service tree with the entities. + * + * Needs queue_lock to be taken and reference to be valid over the call. + */ +static void bfq_reparent_active_entities(struct bfq_data *bfqd, + struct bfq_group *bfqg, + struct bfq_service_tree *st) +{ + struct rb_root *active = &st->active; + struct bfq_entity *entity = NULL; + + if (!RB_EMPTY_ROOT(&st->active)) + entity = bfq_entity_of(rb_first(active)); + + for (; entity ; entity = bfq_entity_of(rb_first(active))) + bfq_reparent_leaf_entity(bfqd, entity); + + if (bfqg->sched_data.in_service_entity) + bfq_reparent_leaf_entity(bfqd, + bfqg->sched_data.in_service_entity); +} + +/** + * bfq_pd_offline - deactivate the entity associated with @pd, + * and reparent its children entities. + * @pd: descriptor of the policy going offline. + * + * blkio already grabs the queue_lock for us, so no need to use + * RCU-based magic + */ +static void bfq_pd_offline(struct blkg_policy_data *pd) +{ + struct bfq_service_tree *st; + struct bfq_group *bfqg; + struct bfq_data *bfqd; + struct bfq_entity *entity; + int i; + + BUG_ON(!pd); + bfqg = pd_to_bfqg(pd); + BUG_ON(!bfqg); + bfqd = bfqg->bfqd; + BUG_ON(bfqd && !bfqd->root_group); + + entity = bfqg->my_entity; + + if (!entity) /* root group */ + return; + + /* + * Empty all service_trees belonging to this group before + * deactivating the group itself. + */ + for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) { + BUG_ON(!bfqg->sched_data.service_tree); + st = bfqg->sched_data.service_tree + i; + /* + * The idle tree may still contain bfq_queues belonging + * to exited task because they never migrated to a different + * cgroup from the one being destroyed now. No one else + * can access them so it's safe to act without any lock. + */ + bfq_flush_idle_tree(st); + + /* + * It may happen that some queues are still active + * (busy) upon group destruction (if the corresponding + * processes have been forced to terminate). We move + * all the leaf entities corresponding to these queues + * to the root_group. + * Also, it may happen that the group has an entity + * in service, which is disconnected from the active + * tree: it must be moved, too. + * There is no need to put the sync queues, as the + * scheduler has taken no reference. + */ + bfq_reparent_active_entities(bfqd, bfqg, st); + BUG_ON(!RB_EMPTY_ROOT(&st->active)); + BUG_ON(!RB_EMPTY_ROOT(&st->idle)); + } + BUG_ON(bfqg->sched_data.next_in_service); + BUG_ON(bfqg->sched_data.in_service_entity); + + __bfq_deactivate_entity(entity, false); + bfq_put_async_queues(bfqd, bfqg); + + /* + * @blkg is going offline and will be ignored by + * blkg_[rw]stat_recursive_sum(). Transfer stats to the parent so + * that they don't get lost. If IOs complete after this point, the + * stats for them will be lost. Oh well... + */ + bfqg_stats_xfer_dead(bfqg); +} + +static void bfq_end_wr_async(struct bfq_data *bfqd) +{ + struct blkcg_gq *blkg; + + list_for_each_entry(blkg, &bfqd->queue->blkg_list, q_node) { + struct bfq_group *bfqg = blkg_to_bfqg(blkg); + BUG_ON(!bfqg); + + bfq_end_wr_async_queues(bfqd, bfqg); + } + bfq_end_wr_async_queues(bfqd, bfqd->root_group); +} + +static int bfq_io_show_weight(struct seq_file *sf, void *v) +{ + struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); + struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg); + unsigned int val = 0; + + if (bfqgd) + val = bfqgd->weight; + + seq_printf(sf, "%u\n", val); + + return 0; +} + +static int bfq_io_set_weight_legacy(struct cgroup_subsys_state *css, + struct cftype *cftype, + u64 val) +{ + struct blkcg *blkcg = css_to_blkcg(css); + struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg); + struct blkcg_gq *blkg; + int ret = -ERANGE; + + if (val < BFQ_MIN_WEIGHT || val > BFQ_MAX_WEIGHT) + return ret; + + ret = 0; + spin_lock_irq(&blkcg->lock); + bfqgd->weight = (unsigned short)val; + hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) { + struct bfq_group *bfqg = blkg_to_bfqg(blkg); + + if (!bfqg) + continue; + /* + * Setting the prio_changed flag of the entity + * to 1 with new_weight == weight would re-set + * the value of the weight to its ioprio mapping. + * Set the flag only if necessary. + */ + if ((unsigned short)val != bfqg->entity.new_weight) { + bfqg->entity.new_weight = (unsigned short)val; + /* + * Make sure that the above new value has been + * stored in bfqg->entity.new_weight before + * setting the prio_changed flag. In fact, + * this flag may be read asynchronously (in + * critical sections protected by a different + * lock than that held here), and finding this + * flag set may cause the execution of the code + * for updating parameters whose value may + * depend also on bfqg->entity.new_weight (in + * __bfq_entity_update_weight_prio). + * This barrier makes sure that the new value + * of bfqg->entity.new_weight is correctly + * seen in that code. + */ + smp_wmb(); + bfqg->entity.prio_changed = 1; + } + } + spin_unlock_irq(&blkcg->lock); + + return ret; +} + +static ssize_t bfq_io_set_weight(struct kernfs_open_file *of, + char *buf, size_t nbytes, + loff_t off) +{ + u64 weight; + /* First unsigned long found in the file is used */ + int ret = kstrtoull(strim(buf), 0, &weight); + + if (ret) + return ret; + + return bfq_io_set_weight_legacy(of_css(of), NULL, weight); +} + +static int bfqg_print_stat(struct seq_file *sf, void *v) +{ + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_stat, + &blkcg_policy_bfq, seq_cft(sf)->private, false); + return 0; +} + +static int bfqg_print_rwstat(struct seq_file *sf, void *v) +{ + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_rwstat, + &blkcg_policy_bfq, seq_cft(sf)->private, true); + return 0; +} + +static u64 bfqg_prfill_stat_recursive(struct seq_file *sf, + struct blkg_policy_data *pd, int off) +{ + u64 sum = blkg_stat_recursive_sum(pd_to_blkg(pd), + &blkcg_policy_bfq, off); + return __blkg_prfill_u64(sf, pd, sum); +} + +static u64 bfqg_prfill_rwstat_recursive(struct seq_file *sf, + struct blkg_policy_data *pd, int off) +{ + struct blkg_rwstat sum = blkg_rwstat_recursive_sum(pd_to_blkg(pd), + &blkcg_policy_bfq, + off); + return __blkg_prfill_rwstat(sf, pd, &sum); +} + +static int bfqg_print_stat_recursive(struct seq_file *sf, void *v) +{ + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), + bfqg_prfill_stat_recursive, &blkcg_policy_bfq, + seq_cft(sf)->private, false); + return 0; +} + +static int bfqg_print_rwstat_recursive(struct seq_file *sf, void *v) +{ + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), + bfqg_prfill_rwstat_recursive, &blkcg_policy_bfq, + seq_cft(sf)->private, true); + return 0; +} + +static u64 bfqg_prfill_sectors(struct seq_file *sf, struct blkg_policy_data *pd, + int off) +{ + u64 sum = blkg_rwstat_total(&pd->blkg->stat_bytes); + + return __blkg_prfill_u64(sf, pd, sum >> 9); +} + +static int bfqg_print_stat_sectors(struct seq_file *sf, void *v) +{ + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), + bfqg_prfill_sectors, &blkcg_policy_bfq, 0, false); + return 0; +} + +static u64 bfqg_prfill_sectors_recursive(struct seq_file *sf, + struct blkg_policy_data *pd, int off) +{ + struct blkg_rwstat tmp = blkg_rwstat_recursive_sum(pd->blkg, NULL, + offsetof(struct blkcg_gq, stat_bytes)); + u64 sum = atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_READ]) + + atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_WRITE]); + + return __blkg_prfill_u64(sf, pd, sum >> 9); +} + +static int bfqg_print_stat_sectors_recursive(struct seq_file *sf, void *v) +{ + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), + bfqg_prfill_sectors_recursive, &blkcg_policy_bfq, 0, + false); + return 0; +} + + +static u64 bfqg_prfill_avg_queue_size(struct seq_file *sf, + struct blkg_policy_data *pd, int off) +{ + struct bfq_group *bfqg = pd_to_bfqg(pd); + u64 samples = blkg_stat_read(&bfqg->stats.avg_queue_size_samples); + u64 v = 0; + + if (samples) { + v = blkg_stat_read(&bfqg->stats.avg_queue_size_sum); + v = div64_u64(v, samples); + } + __blkg_prfill_u64(sf, pd, v); + return 0; +} + +/* print avg_queue_size */ +static int bfqg_print_avg_queue_size(struct seq_file *sf, void *v) +{ + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), + bfqg_prfill_avg_queue_size, &blkcg_policy_bfq, + 0, false); + return 0; +} + +static struct bfq_group * +bfq_create_group_hierarchy(struct bfq_data *bfqd, int node) +{ + int ret; + + ret = blkcg_activate_policy(bfqd->queue, &blkcg_policy_bfq); + if (ret) + return NULL; + + return blkg_to_bfqg(bfqd->queue->root_blkg); +} + +static struct cftype bfq_blkcg_legacy_files[] = { + { + .name = "bfq.weight", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = bfq_io_show_weight, + .write_u64 = bfq_io_set_weight_legacy, + }, + + /* statistics, covers only the tasks in the bfqg */ + { + .name = "bfq.time", + .private = offsetof(struct bfq_group, stats.time), + .seq_show = bfqg_print_stat, + }, + { + .name = "bfq.sectors", + .seq_show = bfqg_print_stat_sectors, + }, + { + .name = "bfq.io_service_bytes", + .private = (unsigned long)&blkcg_policy_bfq, + .seq_show = blkg_print_stat_bytes, + }, + { + .name = "bfq.io_serviced", + .private = (unsigned long)&blkcg_policy_bfq, + .seq_show = blkg_print_stat_ios, + }, + { + .name = "bfq.io_service_time", + .private = offsetof(struct bfq_group, stats.service_time), + .seq_show = bfqg_print_rwstat, + }, + { + .name = "bfq.io_wait_time", + .private = offsetof(struct bfq_group, stats.wait_time), + .seq_show = bfqg_print_rwstat, + }, + { + .name = "bfq.io_merged", + .private = offsetof(struct bfq_group, stats.merged), + .seq_show = bfqg_print_rwstat, + }, + { + .name = "bfq.io_queued", + .private = offsetof(struct bfq_group, stats.queued), + .seq_show = bfqg_print_rwstat, + }, + + /* the same statictics which cover the bfqg and its descendants */ + { + .name = "bfq.time_recursive", + .private = offsetof(struct bfq_group, stats.time), + .seq_show = bfqg_print_stat_recursive, + }, + { + .name = "bfq.sectors_recursive", + .seq_show = bfqg_print_stat_sectors_recursive, + }, + { + .name = "bfq.io_service_bytes_recursive", + .private = (unsigned long)&blkcg_policy_bfq, + .seq_show = blkg_print_stat_bytes_recursive, + }, + { + .name = "bfq.io_serviced_recursive", + .private = (unsigned long)&blkcg_policy_bfq, + .seq_show = blkg_print_stat_ios_recursive, + }, + { + .name = "bfq.io_service_time_recursive", + .private = offsetof(struct bfq_group, stats.service_time), + .seq_show = bfqg_print_rwstat_recursive, + }, + { + .name = "bfq.io_wait_time_recursive", + .private = offsetof(struct bfq_group, stats.wait_time), + .seq_show = bfqg_print_rwstat_recursive, + }, + { + .name = "bfq.io_merged_recursive", + .private = offsetof(struct bfq_group, stats.merged), + .seq_show = bfqg_print_rwstat_recursive, + }, + { + .name = "bfq.io_queued_recursive", + .private = offsetof(struct bfq_group, stats.queued), + .seq_show = bfqg_print_rwstat_recursive, + }, + { + .name = "bfq.avg_queue_size", + .seq_show = bfqg_print_avg_queue_size, + }, + { + .name = "bfq.group_wait_time", + .private = offsetof(struct bfq_group, stats.group_wait_time), + .seq_show = bfqg_print_stat, + }, + { + .name = "bfq.idle_time", + .private = offsetof(struct bfq_group, stats.idle_time), + .seq_show = bfqg_print_stat, + }, + { + .name = "bfq.empty_time", + .private = offsetof(struct bfq_group, stats.empty_time), + .seq_show = bfqg_print_stat, + }, + { + .name = "bfq.dequeue", + .private = offsetof(struct bfq_group, stats.dequeue), + .seq_show = bfqg_print_stat, + }, + { } /* terminate */ +}; + +static struct cftype bfq_blkg_files[] = { + { + .name = "bfq.weight", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = bfq_io_show_weight, + .write = bfq_io_set_weight, + }, + {} /* terminate */ +}; + +#else /* CONFIG_BFQ_SQ_GROUP_IOSCHED */ + +static inline void bfqg_stats_update_io_add(struct bfq_group *bfqg, + struct bfq_queue *bfqq, unsigned int op) { } +static inline void +bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op) { } +static inline void +bfqg_stats_update_io_merged(struct bfq_group *bfqg, unsigned int op) { } +static inline void bfqg_stats_update_completion(struct bfq_group *bfqg, + uint64_t start_time, uint64_t io_start_time, + unsigned int op) { } +static inline void +bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg, + struct bfq_group *curr_bfqg) { } +static inline void bfqg_stats_end_empty_time(struct bfqg_stats *stats) { } +static inline void bfqg_stats_update_dequeue(struct bfq_group *bfqg) { } +static inline void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg) { } +static inline void bfqg_stats_update_idle_time(struct bfq_group *bfqg) { } +static inline void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg) { } +static inline void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg) { } + +static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, + struct bfq_group *bfqg) {} + +static void bfq_init_entity(struct bfq_entity *entity, + struct bfq_group *bfqg) +{ + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); + + entity->weight = entity->new_weight; + entity->orig_weight = entity->new_weight; + if (bfqq) { + bfqq->ioprio = bfqq->new_ioprio; + bfqq->ioprio_class = bfqq->new_ioprio_class; + } + entity->sched_data = &bfqg->sched_data; +} + +static void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio) {} + +static void bfq_end_wr_async(struct bfq_data *bfqd) +{ + bfq_end_wr_async_queues(bfqd, bfqd->root_group); +} + +static struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd, + struct blkcg *blkcg) +{ + return bfqd->root_group; +} + +static struct bfq_group *bfqq_group(struct bfq_queue *bfqq) +{ + return bfqq->bfqd->root_group; +} + +static struct bfq_group * +bfq_create_group_hierarchy(struct bfq_data *bfqd, int node) +{ + struct bfq_group *bfqg; + int i; + + bfqg = kmalloc_node(sizeof(*bfqg), GFP_KERNEL | __GFP_ZERO, node); + if (!bfqg) + return NULL; + + for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) + bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT; + + return bfqg; +} +#endif diff --git a/block/bfq-ioc.c b/block/bfq-ioc.c new file mode 100644 index 000000000000..fb7bb8f08b75 --- /dev/null +++ b/block/bfq-ioc.c @@ -0,0 +1,36 @@ +/* + * BFQ: I/O context handling. + * + * Based on ideas and code from CFQ: + * Copyright (C) 2003 Jens Axboe + * + * Copyright (C) 2008 Fabio Checconi + * Paolo Valente + * + * Copyright (C) 2010 Paolo Valente + */ + +/** + * icq_to_bic - convert iocontext queue structure to bfq_io_cq. + * @icq: the iocontext queue. + */ +static struct bfq_io_cq *icq_to_bic(struct io_cq *icq) +{ + /* bic->icq is the first member, %NULL will convert to %NULL */ + return container_of(icq, struct bfq_io_cq, icq); +} + +/** + * bfq_bic_lookup - search into @ioc a bic associated to @bfqd. + * @bfqd: the lookup key. + * @ioc: the io_context of the process doing I/O. + * + * Queue lock must be held. + */ +static struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd, + struct io_context *ioc) +{ + if (ioc) + return icq_to_bic(ioc_lookup_icq(ioc, bfqd->queue)); + return NULL; +} diff --git a/block/bfq-sched.c b/block/bfq-sched.c new file mode 100644 index 000000000000..ac8991bca9fa --- /dev/null +++ b/block/bfq-sched.c @@ -0,0 +1,2002 @@ +/* + * BFQ: Hierarchical B-WF2Q+ scheduler. + * + * Based on ideas and code from CFQ: + * Copyright (C) 2003 Jens Axboe + * + * Copyright (C) 2008 Fabio Checconi + * Paolo Valente + * + * Copyright (C) 2015 Paolo Valente + * + * Copyright (C) 2016 Paolo Valente + */ + +static struct bfq_group *bfqq_group(struct bfq_queue *bfqq); + +/** + * bfq_gt - compare two timestamps. + * @a: first ts. + * @b: second ts. + * + * Return @a > @b, dealing with wrapping correctly. + */ +static int bfq_gt(u64 a, u64 b) +{ + return (s64)(a - b) > 0; +} + +static struct bfq_entity *bfq_root_active_entity(struct rb_root *tree) +{ + struct rb_node *node = tree->rb_node; + + return rb_entry(node, struct bfq_entity, rb_node); +} + +static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd); + +static bool bfq_update_parent_budget(struct bfq_entity *next_in_service); + +/** + * bfq_update_next_in_service - update sd->next_in_service + * @sd: sched_data for which to perform the update. + * @new_entity: if not NULL, pointer to the entity whose activation, + * requeueing or repositionig triggered the invocation of + * this function. + * + * This function is called to update sd->next_in_service, which, in + * its turn, may change as a consequence of the insertion or + * extraction of an entity into/from one of the active trees of + * sd. These insertions/extractions occur as a consequence of + * activations/deactivations of entities, with some activations being + * 'true' activations, and other activations being requeueings (i.e., + * implementing the second, requeueing phase of the mechanism used to + * reposition an entity in its active tree; see comments on + * __bfq_activate_entity and __bfq_requeue_entity for details). In + * both the last two activation sub-cases, new_entity points to the + * just activated or requeued entity. + * + * Returns true if sd->next_in_service changes in such a way that + * entity->parent may become the next_in_service for its parent + * entity. + */ +static bool bfq_update_next_in_service(struct bfq_sched_data *sd, + struct bfq_entity *new_entity) +{ + struct bfq_entity *next_in_service = sd->next_in_service; + struct bfq_queue *bfqq; + bool parent_sched_may_change = false; + + /* + * If this update is triggered by the activation, requeueing + * or repositiong of an entity that does not coincide with + * sd->next_in_service, then a full lookup in the active tree + * can be avoided. In fact, it is enough to check whether the + * just-modified entity has a higher priority than + * sd->next_in_service, or, even if it has the same priority + * as sd->next_in_service, is eligible and has a lower virtual + * finish time than sd->next_in_service. If this compound + * condition holds, then the new entity becomes the new + * next_in_service. Otherwise no change is needed. + */ + if (new_entity && new_entity != sd->next_in_service) { + /* + * Flag used to decide whether to replace + * sd->next_in_service with new_entity. Tentatively + * set to true, and left as true if + * sd->next_in_service is NULL. + */ + bool replace_next = true; + + /* + * If there is already a next_in_service candidate + * entity, then compare class priorities or timestamps + * to decide whether to replace sd->service_tree with + * new_entity. + */ + if (next_in_service) { + unsigned int new_entity_class_idx = + bfq_class_idx(new_entity); + struct bfq_service_tree *st = + sd->service_tree + new_entity_class_idx; + + /* + * For efficiency, evaluate the most likely + * sub-condition first. + */ + replace_next = + (new_entity_class_idx == + bfq_class_idx(next_in_service) + && + !bfq_gt(new_entity->start, st->vtime) + && + bfq_gt(next_in_service->finish, + new_entity->finish)) + || + new_entity_class_idx < + bfq_class_idx(next_in_service); + } + + if (replace_next) + next_in_service = new_entity; + } else /* invoked because of a deactivation: lookup needed */ + next_in_service = bfq_lookup_next_entity(sd); + + if (next_in_service) { + parent_sched_may_change = !sd->next_in_service || + bfq_update_parent_budget(next_in_service); + } + + sd->next_in_service = next_in_service; + + if (!next_in_service) + return parent_sched_may_change; + + bfqq = bfq_entity_to_bfqq(next_in_service); + if (bfqq) + bfq_log_bfqq(bfqq->bfqd, bfqq, + "update_next_in_service: chosen this queue"); +#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED + else { + struct bfq_group *bfqg = + container_of(next_in_service, + struct bfq_group, entity); + + bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, + "update_next_in_service: chosen this entity"); + } +#endif + return parent_sched_may_change; +} + +#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED +/* both next loops stop at one of the child entities of the root group */ +#define for_each_entity(entity) \ + for (; entity ; entity = entity->parent) + +/* + * For each iteration, compute parent in advance, so as to be safe if + * entity is deallocated during the iteration. Such a deallocation may + * happen as a consequence of a bfq_put_queue that frees the bfq_queue + * containing entity. + */ +#define for_each_entity_safe(entity, parent) \ + for (; entity && ({ parent = entity->parent; 1; }); entity = parent) + +/* + * Returns true if this budget changes may let next_in_service->parent + * become the next_in_service entity for its parent entity. + */ +static bool bfq_update_parent_budget(struct bfq_entity *next_in_service) +{ + struct bfq_entity *bfqg_entity; + struct bfq_group *bfqg; + struct bfq_sched_data *group_sd; + bool ret = false; + + BUG_ON(!next_in_service); + + group_sd = next_in_service->sched_data; + + bfqg = container_of(group_sd, struct bfq_group, sched_data); + /* + * bfq_group's my_entity field is not NULL only if the group + * is not the root group. We must not touch the root entity + * as it must never become an in-service entity. + */ + bfqg_entity = bfqg->my_entity; + if (bfqg_entity) { + if (bfqg_entity->budget > next_in_service->budget) + ret = true; + bfqg_entity->budget = next_in_service->budget; + } + + return ret; +} + +/* + * This function tells whether entity stops being a candidate for next + * service, according to the following logic. + * + * This function is invoked for an entity that is about to be set in + * service. If such an entity is a queue, then the entity is no longer + * a candidate for next service (i.e, a candidate entity to serve + * after the in-service entity is expired). The function then returns + * true. + * + * In contrast, the entity could stil be a candidate for next service + * if it is not a queue, and has more than one child. In fact, even if + * one of its children is about to be set in service, other children + * may still be the next to serve. As a consequence, a non-queue + * entity is not a candidate for next-service only if it has only one + * child. And only if this condition holds, then the function returns + * true for a non-queue entity. + */ +static bool bfq_no_longer_next_in_service(struct bfq_entity *entity) +{ + struct bfq_group *bfqg; + + if (bfq_entity_to_bfqq(entity)) + return true; + + bfqg = container_of(entity, struct bfq_group, entity); + + BUG_ON(bfqg == ((struct bfq_data *)(bfqg->bfqd))->root_group); + BUG_ON(bfqg->active_entities == 0); + if (bfqg->active_entities == 1) + return true; + + return false; +} + +#else /* CONFIG_BFQ_SQ_GROUP_IOSCHED */ +#define for_each_entity(entity) \ + for (; entity ; entity = NULL) + +#define for_each_entity_safe(entity, parent) \ + for (parent = NULL; entity ; entity = parent) + +static bool bfq_update_parent_budget(struct bfq_entity *next_in_service) +{ + return false; +} + +static bool bfq_no_longer_next_in_service(struct bfq_entity *entity) +{ + return true; +} + +#endif /* CONFIG_BFQ_SQ_GROUP_IOSCHED */ + +/* + * Shift for timestamp calculations. This actually limits the maximum + * service allowed in one timestamp delta (small shift values increase it), + * the maximum total weight that can be used for the queues in the system + * (big shift values increase it), and the period of virtual time + * wraparounds. + */ +#define WFQ_SERVICE_SHIFT 22 + +static struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity) +{ + struct bfq_queue *bfqq = NULL; + + BUG_ON(!entity); + + if (!entity->my_sched_data) + bfqq = container_of(entity, struct bfq_queue, entity); + + return bfqq; +} + + +/** + * bfq_delta - map service into the virtual time domain. + * @service: amount of service. + * @weight: scale factor (weight of an entity or weight sum). + */ +static u64 bfq_delta(unsigned long service, unsigned long weight) +{ + u64 d = (u64)service << WFQ_SERVICE_SHIFT; + + do_div(d, weight); + return d; +} + +/** + * bfq_calc_finish - assign the finish time to an entity. + * @entity: the entity to act upon. + * @service: the service to be charged to the entity. + */ +static void bfq_calc_finish(struct bfq_entity *entity, unsigned long service) +{ + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); + unsigned long long start, finish, delta; + + BUG_ON(entity->weight == 0); + + entity->finish = entity->start + + bfq_delta(service, entity->weight); + + start = ((entity->start>>10)*1000)>>12; + finish = ((entity->finish>>10)*1000)>>12; + delta = ((bfq_delta(service, entity->weight)>>10)*1000)>>12; + + if (bfqq) { + bfq_log_bfqq(bfqq->bfqd, bfqq, + "calc_finish: serv %lu, w %d", + service, entity->weight); + bfq_log_bfqq(bfqq->bfqd, bfqq, + "calc_finish: start %llu, finish %llu, delta %llu", + start, finish, delta); +#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED + } else { + struct bfq_group *bfqg = + container_of(entity, struct bfq_group, entity); + + bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, + "calc_finish group: serv %lu, w %d", + service, entity->weight); + bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, + "calc_finish group: start %llu, finish %llu, delta %llu", + start, finish, delta); +#endif + } +} + +/** + * bfq_entity_of - get an entity from a node. + * @node: the node field of the entity. + * + * Convert a node pointer to the relative entity. This is used only + * to simplify the logic of some functions and not as the generic + * conversion mechanism because, e.g., in the tree walking functions, + * the check for a %NULL value would be redundant. + */ +static struct bfq_entity *bfq_entity_of(struct rb_node *node) +{ + struct bfq_entity *entity = NULL; + + if (node) + entity = rb_entry(node, struct bfq_entity, rb_node); + + return entity; +} + +/** + * bfq_extract - remove an entity from a tree. + * @root: the tree root. + * @entity: the entity to remove. + */ +static void bfq_extract(struct rb_root *root, struct bfq_entity *entity) +{ + BUG_ON(entity->tree != root); + + entity->tree = NULL; + rb_erase(&entity->rb_node, root); +} + +/** + * bfq_idle_extract - extract an entity from the idle tree. + * @st: the service tree of the owning @entity. + * @entity: the entity being removed. + */ +static void bfq_idle_extract(struct bfq_service_tree *st, + struct bfq_entity *entity) +{ + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); + struct rb_node *next; + + BUG_ON(entity->tree != &st->idle); + + if (entity == st->first_idle) { + next = rb_next(&entity->rb_node); + st->first_idle = bfq_entity_of(next); + } + + if (entity == st->last_idle) { + next = rb_prev(&entity->rb_node); + st->last_idle = bfq_entity_of(next); + } + + bfq_extract(&st->idle, entity); + + if (bfqq) + list_del(&bfqq->bfqq_list); +} + +/** + * bfq_insert - generic tree insertion. + * @root: tree root. + * @entity: entity to insert. + * + * This is used for the idle and the active tree, since they are both + * ordered by finish time. + */ +static void bfq_insert(struct rb_root *root, struct bfq_entity *entity) +{ + struct bfq_entity *entry; + struct rb_node **node = &root->rb_node; + struct rb_node *parent = NULL; + + BUG_ON(entity->tree); + + while (*node) { + parent = *node; + entry = rb_entry(parent, struct bfq_entity, rb_node); + + if (bfq_gt(entry->finish, entity->finish)) + node = &parent->rb_left; + else + node = &parent->rb_right; + } + + rb_link_node(&entity->rb_node, parent, node); + rb_insert_color(&entity->rb_node, root); + + entity->tree = root; +} + +/** + * bfq_update_min - update the min_start field of a entity. + * @entity: the entity to update. + * @node: one of its children. + * + * This function is called when @entity may store an invalid value for + * min_start due to updates to the active tree. The function assumes + * that the subtree rooted at @node (which may be its left or its right + * child) has a valid min_start value. + */ +static void bfq_update_min(struct bfq_entity *entity, struct rb_node *node) +{ + struct bfq_entity *child; + + if (node) { + child = rb_entry(node, struct bfq_entity, rb_node); + if (bfq_gt(entity->min_start, child->min_start)) + entity->min_start = child->min_start; + } +} + +/** + * bfq_update_active_node - recalculate min_start. + * @node: the node to update. + * + * @node may have changed position or one of its children may have moved, + * this function updates its min_start value. The left and right subtrees + * are assumed to hold a correct min_start value. + */ +static void bfq_update_active_node(struct rb_node *node) +{ + struct bfq_entity *entity = rb_entry(node, struct bfq_entity, rb_node); + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); + + entity->min_start = entity->start; + bfq_update_min(entity, node->rb_right); + bfq_update_min(entity, node->rb_left); + + if (bfqq) { + bfq_log_bfqq(bfqq->bfqd, bfqq, + "update_active_node: new min_start %llu", + ((entity->min_start>>10)*1000)>>12); +#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED + } else { + struct bfq_group *bfqg = + container_of(entity, struct bfq_group, entity); + + bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, + "update_active_node: new min_start %llu", + ((entity->min_start>>10)*1000)>>12); +#endif + } +} + +/** + * bfq_update_active_tree - update min_start for the whole active tree. + * @node: the starting node. + * + * @node must be the deepest modified node after an update. This function + * updates its min_start using the values held by its children, assuming + * that they did not change, and then updates all the nodes that may have + * changed in the path to the root. The only nodes that may have changed + * are the ones in the path or their siblings. + */ +static void bfq_update_active_tree(struct rb_node *node) +{ + struct rb_node *parent; + +up: + bfq_update_active_node(node); + + parent = rb_parent(node); + if (!parent) + return; + + if (node == parent->rb_left && parent->rb_right) + bfq_update_active_node(parent->rb_right); + else if (parent->rb_left) + bfq_update_active_node(parent->rb_left); + + node = parent; + goto up; +} + +static void bfq_weights_tree_add(struct bfq_data *bfqd, + struct bfq_entity *entity, + struct rb_root *root); + +static void bfq_weights_tree_remove(struct bfq_data *bfqd, + struct bfq_entity *entity, + struct rb_root *root); + + +/** + * bfq_active_insert - insert an entity in the active tree of its + * group/device. + * @st: the service tree of the entity. + * @entity: the entity being inserted. + * + * The active tree is ordered by finish time, but an extra key is kept + * per each node, containing the minimum value for the start times of + * its children (and the node itself), so it's possible to search for + * the eligible node with the lowest finish time in logarithmic time. + */ +static void bfq_active_insert(struct bfq_service_tree *st, + struct bfq_entity *entity) +{ + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); + struct rb_node *node = &entity->rb_node; +#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED + struct bfq_sched_data *sd = NULL; + struct bfq_group *bfqg = NULL; + struct bfq_data *bfqd = NULL; +#endif + + bfq_insert(&st->active, entity); + + if (node->rb_left) + node = node->rb_left; + else if (node->rb_right) + node = node->rb_right; + + bfq_update_active_tree(node); + +#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED + sd = entity->sched_data; + bfqg = container_of(sd, struct bfq_group, sched_data); + BUG_ON(!bfqg); + bfqd = (struct bfq_data *)bfqg->bfqd; +#endif + if (bfqq) + list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list); +#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED + else { /* bfq_group */ + BUG_ON(!bfqd); + bfq_weights_tree_add(bfqd, entity, &bfqd->group_weights_tree); + } + if (bfqg != bfqd->root_group) { + BUG_ON(!bfqg); + BUG_ON(!bfqd); + bfqg->active_entities++; + } +#endif +} + +/** + * bfq_ioprio_to_weight - calc a weight from an ioprio. + * @ioprio: the ioprio value to convert. + */ +static unsigned short bfq_ioprio_to_weight(int ioprio) +{ + BUG_ON(ioprio < 0 || ioprio >= IOPRIO_BE_NR); + return (IOPRIO_BE_NR - ioprio) * BFQ_WEIGHT_CONVERSION_COEFF; +} + +/** + * bfq_weight_to_ioprio - calc an ioprio from a weight. + * @weight: the weight value to convert. + * + * To preserve as much as possible the old only-ioprio user interface, + * 0 is used as an escape ioprio value for weights (numerically) equal or + * larger than IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF. + */ +static unsigned short bfq_weight_to_ioprio(int weight) +{ + BUG_ON(weight < BFQ_MIN_WEIGHT || weight > BFQ_MAX_WEIGHT); + return IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF - weight < 0 ? + 0 : IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF - weight; +} + +static void bfq_get_entity(struct bfq_entity *entity) +{ + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); + + if (bfqq) { + bfqq->ref++; + bfq_log_bfqq(bfqq->bfqd, bfqq, "get_entity: %p %d", + bfqq, bfqq->ref); + } +} + +/** + * bfq_find_deepest - find the deepest node that an extraction can modify. + * @node: the node being removed. + * + * Do the first step of an extraction in an rb tree, looking for the + * node that will replace @node, and returning the deepest node that + * the following modifications to the tree can touch. If @node is the + * last node in the tree return %NULL. + */ +static struct rb_node *bfq_find_deepest(struct rb_node *node) +{ + struct rb_node *deepest; + + if (!node->rb_right && !node->rb_left) + deepest = rb_parent(node); + else if (!node->rb_right) + deepest = node->rb_left; + else if (!node->rb_left) + deepest = node->rb_right; + else { + deepest = rb_next(node); + if (deepest->rb_right) + deepest = deepest->rb_right; + else if (rb_parent(deepest) != node) + deepest = rb_parent(deepest); + } + + return deepest; +} + +/** + * bfq_active_extract - remove an entity from the active tree. + * @st: the service_tree containing the tree. + * @entity: the entity being removed. + */ +static void bfq_active_extract(struct bfq_service_tree *st, + struct bfq_entity *entity) +{ + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); + struct rb_node *node; +#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED + struct bfq_sched_data *sd = NULL; + struct bfq_group *bfqg = NULL; + struct bfq_data *bfqd = NULL; +#endif + + node = bfq_find_deepest(&entity->rb_node); + bfq_extract(&st->active, entity); + + if (node) + bfq_update_active_tree(node); + +#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED + sd = entity->sched_data; + bfqg = container_of(sd, struct bfq_group, sched_data); + BUG_ON(!bfqg); + bfqd = (struct bfq_data *)bfqg->bfqd; +#endif + if (bfqq) + list_del(&bfqq->bfqq_list); +#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED + else { /* bfq_group */ + BUG_ON(!bfqd); + bfq_weights_tree_remove(bfqd, entity, + &bfqd->group_weights_tree); + } + if (bfqg != bfqd->root_group) { + BUG_ON(!bfqg); + BUG_ON(!bfqd); + BUG_ON(!bfqg->active_entities); + bfqg->active_entities--; + } +#endif +} + +/** + * bfq_idle_insert - insert an entity into the idle tree. + * @st: the service tree containing the tree. + * @entity: the entity to insert. + */ +static void bfq_idle_insert(struct bfq_service_tree *st, + struct bfq_entity *entity) +{ + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); + struct bfq_entity *first_idle = st->first_idle; + struct bfq_entity *last_idle = st->last_idle; + + if (!first_idle || bfq_gt(first_idle->finish, entity->finish)) + st->first_idle = entity; + if (!last_idle || bfq_gt(entity->finish, last_idle->finish)) + st->last_idle = entity; + + bfq_insert(&st->idle, entity); + + if (bfqq) + list_add(&bfqq->bfqq_list, &bfqq->bfqd->idle_list); +} + +/** + * bfq_forget_entity - do not consider entity any longer for scheduling + * @st: the service tree. + * @entity: the entity being removed. + * @is_in_service: true if entity is currently the in-service entity. + * + * Forget everything about @entity. In addition, if entity represents + * a queue, and the latter is not in service, then release the service + * reference to the queue (the one taken through bfq_get_entity). In + * fact, in this case, there is really no more service reference to + * the queue, as the latter is also outside any service tree. If, + * instead, the queue is in service, then __bfq_bfqd_reset_in_service + * will take care of putting the reference when the queue finally + * stops being served. + */ +static void bfq_forget_entity(struct bfq_service_tree *st, + struct bfq_entity *entity, + bool is_in_service) +{ + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); + BUG_ON(!entity->on_st); + + entity->on_st = false; + st->wsum -= entity->weight; + if (bfqq && !is_in_service) { + bfq_log_bfqq(bfqq->bfqd, bfqq, "forget_entity (before): %p %d", + bfqq, bfqq->ref); + bfq_put_queue(bfqq); + } +} + +/** + * bfq_put_idle_entity - release the idle tree ref of an entity. + * @st: service tree for the entity. + * @entity: the entity being released. + */ +static void bfq_put_idle_entity(struct bfq_service_tree *st, + struct bfq_entity *entity) +{ + bfq_idle_extract(st, entity); + bfq_forget_entity(st, entity, + entity == entity->sched_data->in_service_entity); +} + +/** + * bfq_forget_idle - update the idle tree if necessary. + * @st: the service tree to act upon. + * + * To preserve the global O(log N) complexity we only remove one entry here; + * as the idle tree will not grow indefinitely this can be done safely. + */ +static void bfq_forget_idle(struct bfq_service_tree *st) +{ + struct bfq_entity *first_idle = st->first_idle; + struct bfq_entity *last_idle = st->last_idle; + + if (RB_EMPTY_ROOT(&st->active) && last_idle && + !bfq_gt(last_idle->finish, st->vtime)) { + /* + * Forget the whole idle tree, increasing the vtime past + * the last finish time of idle entities. + */ + st->vtime = last_idle->finish; + } + + if (first_idle && !bfq_gt(first_idle->finish, st->vtime)) + bfq_put_idle_entity(st, first_idle); +} + +/* + * Update weight and priority of entity. If update_class_too is true, + * then update the ioprio_class of entity too. + * + * The reason why the update of ioprio_class is controlled through the + * last parameter is as follows. Changing the ioprio class of an + * entity implies changing the destination service trees for that + * entity. If such a change occurred when the entity is already on one + * of the service trees for its previous class, then the state of the + * entity would become more complex: none of the new possible service + * trees for the entity, according to bfq_entity_service_tree(), would + * match any of the possible service trees on which the entity + * is. Complex operations involving these trees, such as entity + * activations and deactivations, should take into account this + * additional complexity. To avoid this issue, this function is + * invoked with update_class_too unset in the points in the code where + * entity may happen to be on some tree. + */ +static struct bfq_service_tree * +__bfq_entity_update_weight_prio(struct bfq_service_tree *old_st, + struct bfq_entity *entity, + bool update_class_too) +{ + struct bfq_service_tree *new_st = old_st; + + if (entity->prio_changed) { + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); + unsigned int prev_weight, new_weight; + struct bfq_data *bfqd = NULL; + struct rb_root *root; +#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED + struct bfq_sched_data *sd; + struct bfq_group *bfqg; +#endif + + if (bfqq) + bfqd = bfqq->bfqd; +#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED + else { + sd = entity->my_sched_data; + bfqg = container_of(sd, struct bfq_group, sched_data); + BUG_ON(!bfqg); + bfqd = (struct bfq_data *)bfqg->bfqd; + BUG_ON(!bfqd); + } +#endif + + BUG_ON(old_st->wsum < entity->weight); + old_st->wsum -= entity->weight; + + if (entity->new_weight != entity->orig_weight) { + if (entity->new_weight < BFQ_MIN_WEIGHT || + entity->new_weight > BFQ_MAX_WEIGHT) { + pr_crit("update_weight_prio: new_weight %d\n", + entity->new_weight); + if (entity->new_weight < BFQ_MIN_WEIGHT) + entity->new_weight = BFQ_MIN_WEIGHT; + else + entity->new_weight = BFQ_MAX_WEIGHT; + } + entity->orig_weight = entity->new_weight; + if (bfqq) + bfqq->ioprio = + bfq_weight_to_ioprio(entity->orig_weight); + } + + if (bfqq && update_class_too) + bfqq->ioprio_class = bfqq->new_ioprio_class; + + /* + * Reset prio_changed only if the ioprio_class change + * is not pending any longer. + */ + if (!bfqq || bfqq->ioprio_class == bfqq->new_ioprio_class) + entity->prio_changed = 0; + + /* + * NOTE: here we may be changing the weight too early, + * this will cause unfairness. The correct approach + * would have required additional complexity to defer + * weight changes to the proper time instants (i.e., + * when entity->finish <= old_st->vtime). + */ + new_st = bfq_entity_service_tree(entity); + + prev_weight = entity->weight; + new_weight = entity->orig_weight * + (bfqq ? bfqq->wr_coeff : 1); + /* + * If the weight of the entity changes, remove the entity + * from its old weight counter (if there is a counter + * associated with the entity), and add it to the counter + * associated with its new weight. + */ + if (prev_weight != new_weight) { + if (bfqq) + bfq_log_bfqq(bfqq->bfqd, bfqq, + "weight changed %d %d(%d %d)", + prev_weight, new_weight, + entity->orig_weight, + bfqq->wr_coeff); + + root = bfqq ? &bfqd->queue_weights_tree : + &bfqd->group_weights_tree; + bfq_weights_tree_remove(bfqd, entity, root); + } + entity->weight = new_weight; + /* + * Add the entity to its weights tree only if it is + * not associated with a weight-raised queue. + */ + if (prev_weight != new_weight && + (bfqq ? bfqq->wr_coeff == 1 : 1)) + /* If we get here, root has been initialized. */ + bfq_weights_tree_add(bfqd, entity, root); + + new_st->wsum += entity->weight; + + if (new_st != old_st) + entity->start = new_st->vtime; + } + + return new_st; +} + +#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED +static void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg); +#endif + +/** + * bfq_bfqq_served - update the scheduler status after selection for + * service. + * @bfqq: the queue being served. + * @served: bytes to transfer. + * + * NOTE: this can be optimized, as the timestamps of upper level entities + * are synchronized every time a new bfqq is selected for service. By now, + * we keep it to better check consistency. + */ +static void bfq_bfqq_served(struct bfq_queue *bfqq, int served) +{ + struct bfq_entity *entity = &bfqq->entity; + struct bfq_service_tree *st; + + for_each_entity(entity) { + st = bfq_entity_service_tree(entity); + + entity->service += served; + + BUG_ON(st->wsum == 0); + + st->vtime += bfq_delta(served, st->wsum); + bfq_forget_idle(st); + } +#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED + bfqg_stats_set_start_empty_time(bfqq_group(bfqq)); +#endif + st = bfq_entity_service_tree(&bfqq->entity); + bfq_log_bfqq(bfqq->bfqd, bfqq, "bfqq_served %d secs, vtime %llu on %p", + served, ((st->vtime>>10)*1000)>>12, st); +} + +/** + * bfq_bfqq_charge_time - charge an amount of service equivalent to the length + * of the time interval during which bfqq has been in + * service. + * @bfqd: the device + * @bfqq: the queue that needs a service update. + * @time_ms: the amount of time during which the queue has received service + * + * If a queue does not consume its budget fast enough, then providing + * the queue with service fairness may impair throughput, more or less + * severely. For this reason, queues that consume their budget slowly + * are provided with time fairness instead of service fairness. This + * goal is achieved through the BFQ scheduling engine, even if such an + * engine works in the service, and not in the time domain. The trick + * is charging these queues with an inflated amount of service, equal + * to the amount of service that they would have received during their + * service slot if they had been fast, i.e., if their requests had + * been dispatched at a rate equal to the estimated peak rate. + * + * It is worth noting that time fairness can cause important + * distortions in terms of bandwidth distribution, on devices with + * internal queueing. The reason is that I/O requests dispatched + * during the service slot of a queue may be served after that service + * slot is finished, and may have a total processing time loosely + * correlated with the duration of the service slot. This is + * especially true for short service slots. + */ +static void bfq_bfqq_charge_time(struct bfq_data *bfqd, struct bfq_queue *bfqq, + unsigned long time_ms) +{ + struct bfq_entity *entity = &bfqq->entity; + int tot_serv_to_charge = entity->service; + unsigned int timeout_ms = jiffies_to_msecs(bfq_timeout); + + if (time_ms > 0 && time_ms < timeout_ms) + tot_serv_to_charge = + (bfqd->bfq_max_budget * time_ms) / timeout_ms; + + if (tot_serv_to_charge < entity->service) + tot_serv_to_charge = entity->service; + + bfq_log_bfqq(bfqq->bfqd, bfqq, + "charge_time: %lu/%u ms, %d/%d/%d sectors", + time_ms, timeout_ms, entity->service, + tot_serv_to_charge, entity->budget); + + /* Increase budget to avoid inconsistencies */ + if (tot_serv_to_charge > entity->budget) + entity->budget = tot_serv_to_charge; + + bfq_bfqq_served(bfqq, + max_t(int, 0, tot_serv_to_charge - entity->service)); +} + +static void bfq_update_fin_time_enqueue(struct bfq_entity *entity, + struct bfq_service_tree *st, + bool backshifted) +{ + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); + struct bfq_sched_data *sd = entity->sched_data; + + /* + * When this function is invoked, entity is not in any service + * tree, then it is safe to invoke next function with the last + * parameter set (see the comments on the function). + */ + st = __bfq_entity_update_weight_prio(st, entity, true); + bfq_calc_finish(entity, entity->budget); + + /* + * If some queues enjoy backshifting for a while, then their + * (virtual) finish timestamps may happen to become lower and + * lower than the system virtual time. In particular, if + * these queues often happen to be idle for short time + * periods, and during such time periods other queues with + * higher timestamps happen to be busy, then the backshifted + * timestamps of the former queues can become much lower than + * the system virtual time. In fact, to serve the queues with + * higher timestamps while the ones with lower timestamps are + * idle, the system virtual time may be pushed-up to much + * higher values than the finish timestamps of the idle + * queues. As a consequence, the finish timestamps of all new + * or newly activated queues may end up being much larger than + * those of lucky queues with backshifted timestamps. The + * latter queues may then monopolize the device for a lot of + * time. This would simply break service guarantees. + * + * To reduce this problem, push up a little bit the + * backshifted timestamps of the queue associated with this + * entity (only a queue can happen to have the backshifted + * flag set): just enough to let the finish timestamp of the + * queue be equal to the current value of the system virtual + * time. This may introduce a little unfairness among queues + * with backshifted timestamps, but it does not break + * worst-case fairness guarantees. + * + * As a special case, if bfqq is weight-raised, push up + * timestamps much less, to keep very low the probability that + * this push up causes the backshifted finish timestamps of + * weight-raised queues to become higher than the backshifted + * finish timestamps of non weight-raised queues. + */ + if (backshifted && bfq_gt(st->vtime, entity->finish)) { + unsigned long delta = st->vtime - entity->finish; + + if (bfqq) + delta /= bfqq->wr_coeff; + + entity->start += delta; + entity->finish += delta; + + if (bfqq) { + bfq_log_bfqq(bfqq->bfqd, bfqq, + "__activate_entity: new queue finish %llu", + ((entity->finish>>10)*1000)>>12); +#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED + } else { + struct bfq_group *bfqg = + container_of(entity, struct bfq_group, entity); + + bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, + "__activate_entity: new group finish %llu", + ((entity->finish>>10)*1000)>>12); +#endif + } + } + + bfq_active_insert(st, entity); + + if (bfqq) { + bfq_log_bfqq(bfqq->bfqd, bfqq, + "__activate_entity: queue %seligible in st %p", + entity->start <= st->vtime ? "" : "non ", st); +#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED + } else { + struct bfq_group *bfqg = + container_of(entity, struct bfq_group, entity); + + bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, + "__activate_entity: group %seligible in st %p", + entity->start <= st->vtime ? "" : "non ", st); +#endif + } + BUG_ON(RB_EMPTY_ROOT(&st->active)); + BUG_ON(&st->active != &sd->service_tree->active && + &st->active != &(sd->service_tree+1)->active && + &st->active != &(sd->service_tree+2)->active); +} + +/** + * __bfq_activate_entity - handle activation of entity. + * @entity: the entity being activated. + * @non_blocking_wait_rq: true if entity was waiting for a request + * + * Called for a 'true' activation, i.e., if entity is not active and + * one of its children receives a new request. + * + * Basically, this function updates the timestamps of entity and + * inserts entity into its active tree, ater possible extracting it + * from its idle tree. + */ +static void __bfq_activate_entity(struct bfq_entity *entity, + bool non_blocking_wait_rq) +{ + struct bfq_sched_data *sd = entity->sched_data; + struct bfq_service_tree *st = bfq_entity_service_tree(entity); + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); + bool backshifted = false; + unsigned long long min_vstart; + + BUG_ON(!sd); + BUG_ON(!st); + + /* See comments on bfq_fqq_update_budg_for_activation */ + if (non_blocking_wait_rq && bfq_gt(st->vtime, entity->finish)) { + backshifted = true; + min_vstart = entity->finish; + } else + min_vstart = st->vtime; + + if (entity->tree == &st->idle) { + /* + * Must be on the idle tree, bfq_idle_extract() will + * check for that. + */ + bfq_idle_extract(st, entity); + entity->start = bfq_gt(min_vstart, entity->finish) ? + min_vstart : entity->finish; + } else { + /* + * The finish time of the entity may be invalid, and + * it is in the past for sure, otherwise the queue + * would have been on the idle tree. + */ + entity->start = min_vstart; + st->wsum += entity->weight; + /* + * entity is about to be inserted into a service tree, + * and then set in service: get a reference to make + * sure entity does not disappear until it is no + * longer in service or scheduled for service. + */ + bfq_get_entity(entity); + + BUG_ON(entity->on_st && bfqq); + +#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED + if (entity->on_st && !bfqq) { + struct bfq_group *bfqg = + container_of(entity, struct bfq_group, + entity); + + bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, + bfqg, + "activate bug, class %d in_service %p", + bfq_class_idx(entity), sd->in_service_entity); + } +#endif + BUG_ON(entity->on_st && !bfqq); + entity->on_st = true; + } + + bfq_update_fin_time_enqueue(entity, st, backshifted); +} + +/** + * __bfq_requeue_entity - handle requeueing or repositioning of an entity. + * @entity: the entity being requeued or repositioned. + * + * Requeueing is needed if this entity stops being served, which + * happens if a leaf descendant entity has expired. On the other hand, + * repositioning is needed if the next_inservice_entity for the child + * entity has changed. See the comments inside the function for + * details. + * + * Basically, this function: 1) removes entity from its active tree if + * present there, 2) updates the timestamps of entity and 3) inserts + * entity back into its active tree (in the new, right position for + * the new values of the timestamps). + */ +static void __bfq_requeue_entity(struct bfq_entity *entity) +{ + struct bfq_sched_data *sd = entity->sched_data; + struct bfq_service_tree *st = bfq_entity_service_tree(entity); + + BUG_ON(!sd); + BUG_ON(!st); + + BUG_ON(entity != sd->in_service_entity && + entity->tree != &st->active); + + if (entity == sd->in_service_entity) { + /* + * We are requeueing the current in-service entity, + * which may have to be done for one of the following + * reasons: + * - entity represents the in-service queue, and the + * in-service queue is being requeued after an + * expiration; + * - entity represents a group, and its budget has + * changed because one of its child entities has + * just been either activated or requeued for some + * reason; the timestamps of the entity need then to + * be updated, and the entity needs to be enqueued + * or repositioned accordingly. + * + * In particular, before requeueing, the start time of + * the entity must be moved forward to account for the + * service that the entity has received while in + * service. This is done by the next instructions. The + * finish time will then be updated according to this + * new value of the start time, and to the budget of + * the entity. + */ + bfq_calc_finish(entity, entity->service); + entity->start = entity->finish; + BUG_ON(entity->tree && entity->tree != &st->active); + /* + * In addition, if the entity had more than one child + * when set in service, then was not extracted from + * the active tree. This implies that the position of + * the entity in the active tree may need to be + * changed now, because we have just updated the start + * time of the entity, and we will update its finish + * time in a moment (the requeueing is then, more + * precisely, a repositioning in this case). To + * implement this repositioning, we: 1) dequeue the + * entity here, 2) update the finish time and + * requeue the entity according to the new + * timestamps below. + */ + if (entity->tree) + bfq_active_extract(st, entity); + } else { /* The entity is already active, and not in service */ + /* + * In this case, this function gets called only if the + * next_in_service entity below this entity has + * changed, and this change has caused the budget of + * this entity to change, which, finally implies that + * the finish time of this entity must be + * updated. Such an update may cause the scheduling, + * i.e., the position in the active tree, of this + * entity to change. We handle this change by: 1) + * dequeueing the entity here, 2) updating the finish + * time and requeueing the entity according to the new + * timestamps below. This is the same approach as the + * non-extracted-entity sub-case above. + */ + bfq_active_extract(st, entity); + } + + bfq_update_fin_time_enqueue(entity, st, false); +} + +static void __bfq_activate_requeue_entity(struct bfq_entity *entity, + struct bfq_sched_data *sd, + bool non_blocking_wait_rq) +{ + struct bfq_service_tree *st = bfq_entity_service_tree(entity); + + if (sd->in_service_entity == entity || entity->tree == &st->active) + /* + * in service or already queued on the active tree, + * requeue or reposition + */ + __bfq_requeue_entity(entity); + else + /* + * Not in service and not queued on its active tree: + * the activity is idle and this is a true activation. + */ + __bfq_activate_entity(entity, non_blocking_wait_rq); +} + + +/** + * bfq_activate_entity - activate or requeue an entity representing a bfq_queue, + * and activate, requeue or reposition all ancestors + * for which such an update becomes necessary. + * @entity: the entity to activate. + * @non_blocking_wait_rq: true if this entity was waiting for a request + * @requeue: true if this is a requeue, which implies that bfqq is + * being expired; thus ALL its ancestors stop being served and must + * therefore be requeued + */ +static void bfq_activate_requeue_entity(struct bfq_entity *entity, + bool non_blocking_wait_rq, + bool requeue) +{ + struct bfq_sched_data *sd; + + for_each_entity(entity) { + BUG_ON(!entity); + sd = entity->sched_data; + __bfq_activate_requeue_entity(entity, sd, non_blocking_wait_rq); + + BUG_ON(RB_EMPTY_ROOT(&sd->service_tree->active) && + RB_EMPTY_ROOT(&(sd->service_tree+1)->active) && + RB_EMPTY_ROOT(&(sd->service_tree+2)->active)); + + if (!bfq_update_next_in_service(sd, entity) && !requeue) { + BUG_ON(!sd->next_in_service); + break; + } + BUG_ON(!sd->next_in_service); + } +} + +/** + * __bfq_deactivate_entity - deactivate an entity from its service tree. + * @entity: the entity to deactivate. + * @ins_into_idle_tree: if false, the entity will not be put into the + * idle tree. + * + * Deactivates an entity, independently from its previous state. Must + * be invoked only if entity is on a service tree. Extracts the entity + * from that tree, and if necessary and allowed, puts it on the idle + * tree. + */ +static bool __bfq_deactivate_entity(struct bfq_entity *entity, + bool ins_into_idle_tree) +{ + struct bfq_sched_data *sd = entity->sched_data; + struct bfq_service_tree *st; + bool is_in_service; + + if (!entity->on_st) { /* entity never activated, or already inactive */ + BUG_ON(sd && entity == sd->in_service_entity); + return false; + } + + /* + * If we get here, then entity is active, which implies that + * bfq_group_set_parent has already been invoked for the group + * represented by entity. Therefore, the field + * entity->sched_data has been set, and we can safely use it. + */ + st = bfq_entity_service_tree(entity); + is_in_service = entity == sd->in_service_entity; + + BUG_ON(is_in_service && entity->tree && entity->tree != &st->active); + + if (is_in_service) + bfq_calc_finish(entity, entity->service); + + if (entity->tree == &st->active) + bfq_active_extract(st, entity); + else if (!is_in_service && entity->tree == &st->idle) + bfq_idle_extract(st, entity); + else if (entity->tree) + BUG(); + + if (!ins_into_idle_tree || !bfq_gt(entity->finish, st->vtime)) + bfq_forget_entity(st, entity, is_in_service); + else + bfq_idle_insert(st, entity); + + return true; +} + +/** + * bfq_deactivate_entity - deactivate an entity representing a bfq_queue. + * @entity: the entity to deactivate. + * @ins_into_idle_tree: true if the entity can be put on the idle tree + */ +static void bfq_deactivate_entity(struct bfq_entity *entity, + bool ins_into_idle_tree, + bool expiration) +{ + struct bfq_sched_data *sd; + struct bfq_entity *parent = NULL; + + for_each_entity_safe(entity, parent) { + sd = entity->sched_data; + + BUG_ON(sd == NULL); /* + * It would mean that this is the + * root group. + */ + + BUG_ON(expiration && entity != sd->in_service_entity); + + BUG_ON(entity != sd->in_service_entity && + entity->tree == + &bfq_entity_service_tree(entity)->active && + !sd->next_in_service); + + if (!__bfq_deactivate_entity(entity, ins_into_idle_tree)) { + /* + * entity is not in any tree any more, so + * this deactivation is a no-op, and there is + * nothing to change for upper-level entities + * (in case of expiration, this can never + * happen). + */ + BUG_ON(expiration); /* + * entity cannot be already out of + * any tree + */ + return; + } + + if (sd->next_in_service == entity) + /* + * entity was the next_in_service entity, + * then, since entity has just been + * deactivated, a new one must be found. + */ + bfq_update_next_in_service(sd, NULL); + + if (sd->next_in_service) { + /* + * The parent entity is still backlogged, + * because next_in_service is not NULL. So, no + * further upwards deactivation must be + * performed. Yet, next_in_service has + * changed. Then the schedule does need to be + * updated upwards. + */ + BUG_ON(sd->next_in_service == entity); + break; + } + + /* + * If we get here, then the parent is no more + * backlogged and we need to propagate the + * deactivation upwards. Thus let the loop go on. + */ + + /* + * Also let parent be queued into the idle tree on + * deactivation, to preserve service guarantees, and + * assuming that who invoked this function does not + * need parent entities too to be removed completely. + */ + ins_into_idle_tree = true; + } + + /* + * If the deactivation loop is fully executed, then there are + * no more entities to touch and next loop is not executed at + * all. Otherwise, requeue remaining entities if they are + * about to stop receiving service, or reposition them if this + * is not the case. + */ + entity = parent; + for_each_entity(entity) { + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); + + /* + * Invoke __bfq_requeue_entity on entity, even if + * already active, to requeue/reposition it in the + * active tree (because sd->next_in_service has + * changed) + */ + __bfq_requeue_entity(entity); + + sd = entity->sched_data; + BUG_ON(expiration && sd->in_service_entity != entity); + + if (bfqq) + bfq_log_bfqq(bfqq->bfqd, bfqq, + "invoking udpdate_next for this queue"); +#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED + else { + struct bfq_group *bfqg = + container_of(entity, + struct bfq_group, entity); + + bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, + "invoking udpdate_next for this entity"); + } +#endif + if (!bfq_update_next_in_service(sd, entity) && + !expiration) + /* + * next_in_service unchanged or not causing + * any change in entity->parent->sd, and no + * requeueing needed for expiration: stop + * here. + */ + break; + } +} + +/** + * bfq_calc_vtime_jump - compute the value to which the vtime should jump, + * if needed, to have at least one entity eligible. + * @st: the service tree to act upon. + * + * Assumes that st is not empty. + */ +static u64 bfq_calc_vtime_jump(struct bfq_service_tree *st) +{ + struct bfq_entity *root_entity = bfq_root_active_entity(&st->active); + + if (bfq_gt(root_entity->min_start, st->vtime)) { + struct bfq_queue *bfqq = bfq_entity_to_bfqq(root_entity); + + if (bfqq) + bfq_log_bfqq(bfqq->bfqd, bfqq, + "calc_vtime_jump: new value %llu", + root_entity->min_start); +#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED + else { + struct bfq_group *bfqg = + container_of(root_entity, struct bfq_group, + entity); + + bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, + "calc_vtime_jump: new value %llu", + root_entity->min_start); + } +#endif + return root_entity->min_start; + } + return st->vtime; +} + +static void bfq_update_vtime(struct bfq_service_tree *st, u64 new_value) +{ + if (new_value > st->vtime) { + st->vtime = new_value; + bfq_forget_idle(st); + } +} + +/** + * bfq_first_active_entity - find the eligible entity with + * the smallest finish time + * @st: the service tree to select from. + * @vtime: the system virtual to use as a reference for eligibility + * + * This function searches the first schedulable entity, starting from the + * root of the tree and going on the left every time on this side there is + * a subtree with at least one eligible (start >= vtime) entity. The path on + * the right is followed only if a) the left subtree contains no eligible + * entities and b) no eligible entity has been found yet. + */ +static struct bfq_entity *bfq_first_active_entity(struct bfq_service_tree *st, + u64 vtime) +{ + struct bfq_entity *entry, *first = NULL; + struct rb_node *node = st->active.rb_node; + + while (node) { + entry = rb_entry(node, struct bfq_entity, rb_node); +left: + if (!bfq_gt(entry->start, vtime)) + first = entry; + + BUG_ON(bfq_gt(entry->min_start, vtime)); + + if (node->rb_left) { + entry = rb_entry(node->rb_left, + struct bfq_entity, rb_node); + if (!bfq_gt(entry->min_start, vtime)) { + node = node->rb_left; + goto left; + } + } + if (first) + break; + node = node->rb_right; + } + + BUG_ON(!first && !RB_EMPTY_ROOT(&st->active)); + return first; +} + +/** + * __bfq_lookup_next_entity - return the first eligible entity in @st. + * @st: the service tree. + * + * If there is no in-service entity for the sched_data st belongs to, + * then return the entity that will be set in service if: + * 1) the parent entity this st belongs to is set in service; + * 2) no entity belonging to such parent entity undergoes a state change + * that would influence the timestamps of the entity (e.g., becomes idle, + * becomes backlogged, changes its budget, ...). + * + * In this first case, update the virtual time in @st too (see the + * comments on this update inside the function). + * + * In constrast, if there is an in-service entity, then return the + * entity that would be set in service if not only the above + * conditions, but also the next one held true: the currently + * in-service entity, on expiration, + * 1) gets a finish time equal to the current one, or + * 2) is not eligible any more, or + * 3) is idle. + */ +static struct bfq_entity * +__bfq_lookup_next_entity(struct bfq_service_tree *st, bool in_service +#if 0 + , bool force +#endif + ) +{ + struct bfq_entity *entity +#if 0 + , *new_next_in_service = NULL +#endif + ; + u64 new_vtime; + struct bfq_queue *bfqq; + + if (RB_EMPTY_ROOT(&st->active)) + return NULL; + + /* + * Get the value of the system virtual time for which at + * least one entity is eligible. + */ + new_vtime = bfq_calc_vtime_jump(st); + + /* + * If there is no in-service entity for the sched_data this + * active tree belongs to, then push the system virtual time + * up to the value that guarantees that at least one entity is + * eligible. If, instead, there is an in-service entity, then + * do not make any such update, because there is already an + * eligible entity, namely the in-service one (even if the + * entity is not on st, because it was extracted when set in + * service). + */ + if (!in_service) + bfq_update_vtime(st, new_vtime); + + entity = bfq_first_active_entity(st, new_vtime); + BUG_ON(bfq_gt(entity->start, new_vtime)); + + /* Log some information */ + bfqq = bfq_entity_to_bfqq(entity); + if (bfqq) + bfq_log_bfqq(bfqq->bfqd, bfqq, + "__lookup_next: start %llu vtime %llu st %p", + ((entity->start>>10)*1000)>>12, + ((new_vtime>>10)*1000)>>12, st); +#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED + else { + struct bfq_group *bfqg = + container_of(entity, struct bfq_group, entity); + + bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, + "__lookup_next: start %llu vtime %llu st %p", + ((entity->start>>10)*1000)>>12, + ((new_vtime>>10)*1000)>>12, st); + } +#endif + + BUG_ON(!entity); + + return entity; +} + +/** + * bfq_lookup_next_entity - return the first eligible entity in @sd. + * @sd: the sched_data. + * + * This function is invoked when there has been a change in the trees + * for sd, and we need know what is the new next entity after this + * change. + */ +static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd) +{ + struct bfq_service_tree *st = sd->service_tree; + struct bfq_service_tree *idle_class_st = st + (BFQ_IOPRIO_CLASSES - 1); + struct bfq_entity *entity = NULL; + struct bfq_queue *bfqq; + int class_idx = 0; + + BUG_ON(!sd); + BUG_ON(!st); + /* + * Choose from idle class, if needed to guarantee a minimum + * bandwidth to this class (and if there is some active entity + * in idle class). This should also mitigate + * priority-inversion problems in case a low priority task is + * holding file system resources. + */ + if (time_is_before_jiffies(sd->bfq_class_idle_last_service + + BFQ_CL_IDLE_TIMEOUT)) { + if (!RB_EMPTY_ROOT(&idle_class_st->active)) + class_idx = BFQ_IOPRIO_CLASSES - 1; + /* About to be served if backlogged, or not yet backlogged */ + sd->bfq_class_idle_last_service = jiffies; + } + + /* + * Find the next entity to serve for the highest-priority + * class, unless the idle class needs to be served. + */ + for (; class_idx < BFQ_IOPRIO_CLASSES; class_idx++) { + entity = __bfq_lookup_next_entity(st + class_idx, + sd->in_service_entity); + + if (entity) + break; + } + + BUG_ON(!entity && + (!RB_EMPTY_ROOT(&st->active) || !RB_EMPTY_ROOT(&(st+1)->active) || + !RB_EMPTY_ROOT(&(st+2)->active))); + + if (!entity) + return NULL; + + /* Log some information */ + bfqq = bfq_entity_to_bfqq(entity); + if (bfqq) + bfq_log_bfqq(bfqq->bfqd, bfqq, "chosen from st %p %d", + st + class_idx, class_idx); +#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED + else { + struct bfq_group *bfqg = + container_of(entity, struct bfq_group, entity); + + bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, + "chosen from st %p %d", + st + class_idx, class_idx); + } +#endif + + return entity; +} + +static bool next_queue_may_preempt(struct bfq_data *bfqd) +{ + struct bfq_sched_data *sd = &bfqd->root_group->sched_data; + + return sd->next_in_service != sd->in_service_entity; +} + +/* + * Get next queue for service. + */ +static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd) +{ + struct bfq_entity *entity = NULL; + struct bfq_sched_data *sd; + struct bfq_queue *bfqq; + + BUG_ON(bfqd->in_service_queue); + + if (bfqd->busy_queues == 0) + return NULL; + + /* + * Traverse the path from the root to the leaf entity to + * serve. Set in service all the entities visited along the + * way. + */ + sd = &bfqd->root_group->sched_data; + for (; sd ; sd = entity->my_sched_data) { +#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED + if (entity) { + struct bfq_group *bfqg = + container_of(entity, struct bfq_group, entity); + + bfq_log_bfqg(bfqd, bfqg, + "get_next_queue: lookup in this group"); + if (!sd->next_in_service) + pr_crit("get_next_queue: lookup in this group"); + } else { + bfq_log_bfqg(bfqd, bfqd->root_group, + "get_next_queue: lookup in root group"); + if (!sd->next_in_service) + pr_crit("get_next_queue: lookup in root group"); + } +#endif + + BUG_ON(!sd->next_in_service); + + /* + * WARNING. We are about to set the in-service entity + * to sd->next_in_service, i.e., to the (cached) value + * returned by bfq_lookup_next_entity(sd) the last + * time it was invoked, i.e., the last time when the + * service order in sd changed as a consequence of the + * activation or deactivation of an entity. In this + * respect, if we execute bfq_lookup_next_entity(sd) + * in this very moment, it may, although with low + * probability, yield a different entity than that + * pointed to by sd->next_in_service. This rare event + * happens in case there was no CLASS_IDLE entity to + * serve for sd when bfq_lookup_next_entity(sd) was + * invoked for the last time, while there is now one + * such entity. + * + * If the above event happens, then the scheduling of + * such entity in CLASS_IDLE is postponed until the + * service of the sd->next_in_service entity + * finishes. In fact, when the latter is expired, + * bfq_lookup_next_entity(sd) gets called again, + * exactly to update sd->next_in_service. + */ + + /* Make next_in_service entity become in_service_entity */ + entity = sd->next_in_service; + sd->in_service_entity = entity; + + /* + * Reset the accumulator of the amount of service that + * the entity is about to receive. + */ + entity->service = 0; + + /* + * If entity is no longer a candidate for next + * service, then we extract it from its active tree, + * for the following reason. To further boost the + * throughput in some special case, BFQ needs to know + * which is the next candidate entity to serve, while + * there is already an entity in service. In this + * respect, to make it easy to compute/update the next + * candidate entity to serve after the current + * candidate has been set in service, there is a case + * where it is necessary to extract the current + * candidate from its service tree. Such a case is + * when the entity just set in service cannot be also + * a candidate for next service. Details about when + * this conditions holds are reported in the comments + * on the function bfq_no_longer_next_in_service() + * invoked below. + */ + if (bfq_no_longer_next_in_service(entity)) + bfq_active_extract(bfq_entity_service_tree(entity), + entity); + + /* + * For the same reason why we may have just extracted + * entity from its active tree, we may need to update + * next_in_service for the sched_data of entity too, + * regardless of whether entity has been extracted. + * In fact, even if entity has not been extracted, a + * descendant entity may get extracted. Such an event + * would cause a change in next_in_service for the + * level of the descendant entity, and thus possibly + * back to upper levels. + * + * We cannot perform the resulting needed update + * before the end of this loop, because, to know which + * is the correct next-to-serve candidate entity for + * each level, we need first to find the leaf entity + * to set in service. In fact, only after we know + * which is the next-to-serve leaf entity, we can + * discover whether the parent entity of the leaf + * entity becomes the next-to-serve, and so on. + */ + + /* Log some information */ + bfqq = bfq_entity_to_bfqq(entity); + if (bfqq) + bfq_log_bfqq(bfqd, bfqq, + "get_next_queue: this queue, finish %llu", + (((entity->finish>>10)*1000)>>10)>>2); +#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED + else { + struct bfq_group *bfqg = + container_of(entity, struct bfq_group, entity); + + bfq_log_bfqg(bfqd, bfqg, + "get_next_queue: this entity, finish %llu", + (((entity->finish>>10)*1000)>>10)>>2); + } +#endif + + } + + BUG_ON(!entity); + bfqq = bfq_entity_to_bfqq(entity); + BUG_ON(!bfqq); + + /* + * We can finally update all next-to-serve entities along the + * path from the leaf entity just set in service to the root. + */ + for_each_entity(entity) { + struct bfq_sched_data *sd = entity->sched_data; + + if(!bfq_update_next_in_service(sd, NULL)) + break; + } + + return bfqq; +} + +static void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd) +{ + struct bfq_queue *in_serv_bfqq = bfqd->in_service_queue; + struct bfq_entity *in_serv_entity = &in_serv_bfqq->entity; + struct bfq_entity *entity = in_serv_entity; + + if (bfqd->in_service_bic) { + put_io_context(bfqd->in_service_bic->icq.ioc); + bfqd->in_service_bic = NULL; + } + + bfq_clear_bfqq_wait_request(in_serv_bfqq); + hrtimer_try_to_cancel(&bfqd->idle_slice_timer); + bfqd->in_service_queue = NULL; + + /* + * When this function is called, all in-service entities have + * been properly deactivated or requeued, so we can safely + * execute the final step: reset in_service_entity along the + * path from entity to the root. + */ + for_each_entity(entity) + entity->sched_data->in_service_entity = NULL; + + /* + * in_serv_entity is no longer in service, so, if it is in no + * service tree either, then release the service reference to + * the queue it represents (taken with bfq_get_entity). + */ + if (!in_serv_entity->on_st) + bfq_put_queue(in_serv_bfqq); +} + +static void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, + bool ins_into_idle_tree, bool expiration) +{ + struct bfq_entity *entity = &bfqq->entity; + + bfq_deactivate_entity(entity, ins_into_idle_tree, expiration); +} + +static void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) +{ + struct bfq_entity *entity = &bfqq->entity; + struct bfq_service_tree *st = bfq_entity_service_tree(entity); + + BUG_ON(bfqq == bfqd->in_service_queue); + BUG_ON(entity->tree != &st->active && entity->tree != &st->idle && + entity->on_st); + + bfq_activate_requeue_entity(entity, bfq_bfqq_non_blocking_wait_rq(bfqq), + false); + bfq_clear_bfqq_non_blocking_wait_rq(bfqq); +} + +static void bfq_requeue_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) +{ + struct bfq_entity *entity = &bfqq->entity; + + bfq_activate_requeue_entity(entity, false, + bfqq == bfqd->in_service_queue); +} + +static void bfqg_stats_update_dequeue(struct bfq_group *bfqg); + +/* + * Called when the bfqq no longer has requests pending, remove it from + * the service tree. As a special case, it can be invoked during an + * expiration. + */ +static void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq, + bool expiration) +{ + BUG_ON(!bfq_bfqq_busy(bfqq)); + BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list)); + + bfq_log_bfqq(bfqd, bfqq, "del from busy"); + + bfq_clear_bfqq_busy(bfqq); + + BUG_ON(bfqd->busy_queues == 0); + bfqd->busy_queues--; + + if (!bfqq->dispatched) + bfq_weights_tree_remove(bfqd, &bfqq->entity, + &bfqd->queue_weights_tree); + + if (bfqq->wr_coeff > 1) { + bfqd->wr_busy_queues--; + BUG_ON(bfqd->wr_busy_queues < 0); + } + + bfqg_stats_update_dequeue(bfqq_group(bfqq)); + + BUG_ON(bfqq->entity.budget < 0); + + bfq_deactivate_bfqq(bfqd, bfqq, true, expiration); +} + +/* + * Called when an inactive queue receives a new request. + */ +static void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq) +{ + BUG_ON(bfq_bfqq_busy(bfqq)); + BUG_ON(bfqq == bfqd->in_service_queue); + + bfq_log_bfqq(bfqd, bfqq, "add to busy"); + + bfq_activate_bfqq(bfqd, bfqq); + + bfq_mark_bfqq_busy(bfqq); + bfqd->busy_queues++; + + if (!bfqq->dispatched) + if (bfqq->wr_coeff == 1) + bfq_weights_tree_add(bfqd, &bfqq->entity, + &bfqd->queue_weights_tree); + + if (bfqq->wr_coeff > 1) { + bfqd->wr_busy_queues++; + BUG_ON(bfqd->wr_busy_queues > bfqd->busy_queues); + } + +} diff --git a/block/bfq-sq-iosched.c b/block/bfq-sq-iosched.c new file mode 100644 index 000000000000..65e7c7e77f3c --- /dev/null +++ b/block/bfq-sq-iosched.c @@ -0,0 +1,5379 @@ +/* + * Budget Fair Queueing (BFQ) I/O scheduler. + * + * Based on ideas and code from CFQ: + * Copyright (C) 2003 Jens Axboe + * + * Copyright (C) 2008 Fabio Checconi + * Paolo Valente + * + * Copyright (C) 2015 Paolo Valente + * + * Copyright (C) 2017 Paolo Valente + * + * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ + * file. + * + * BFQ is a proportional-share I/O scheduler, with some extra + * low-latency capabilities. BFQ also supports full hierarchical + * scheduling through cgroups. Next paragraphs provide an introduction + * on BFQ inner workings. Details on BFQ benefits and usage can be + * found in Documentation/block/bfq-iosched.txt. + * + * BFQ is a proportional-share storage-I/O scheduling algorithm based + * on the slice-by-slice service scheme of CFQ. But BFQ assigns + * budgets, measured in number of sectors, to processes instead of + * time slices. The device is not granted to the in-service process + * for a given time slice, but until it has exhausted its assigned + * budget. This change from the time to the service domain enables BFQ + * to distribute the device throughput among processes as desired, + * without any distortion due to throughput fluctuations, or to device + * internal queueing. BFQ uses an ad hoc internal scheduler, called + * B-WF2Q+, to schedule processes according to their budgets. More + * precisely, BFQ schedules queues associated with processes. Thanks to + * the accurate policy of B-WF2Q+, BFQ can afford to assign high + * budgets to I/O-bound processes issuing sequential requests (to + * boost the throughput), and yet guarantee a low latency to + * interactive and soft real-time applications. + * + * NOTE: if the main or only goal, with a given device, is to achieve + * the maximum-possible throughput at all times, then do switch off + * all low-latency heuristics for that device, by setting low_latency + * to 0. + * + * BFQ is described in [1], where also a reference to the initial, more + * theoretical paper on BFQ can be found. The interested reader can find + * in the latter paper full details on the main algorithm, as well as + * formulas of the guarantees and formal proofs of all the properties. + * With respect to the version of BFQ presented in these papers, this + * implementation adds a few more heuristics, such as the one that + * guarantees a low latency to soft real-time applications, and a + * hierarchical extension based on H-WF2Q+. + * + * B-WF2Q+ is based on WF2Q+, that is described in [2], together with + * H-WF2Q+, while the augmented tree used to implement B-WF2Q+ with O(log N) + * complexity derives from the one introduced with EEVDF in [3]. + * + * [1] P. Valente, A. Avanzini, "Evolution of the BFQ Storage I/O + * Scheduler", Proceedings of the First Workshop on Mobile System + * Technologies (MST-2015), May 2015. + * http://algogroup.unimore.it/people/paolo/disk_sched/mst-2015.pdf + * + * http://algogroup.unimo.it/people/paolo/disk_sched/bf1-v1-suite-results.pdf + * + * [2] Jon C.R. Bennett and H. Zhang, ``Hierarchical Packet Fair Queueing + * Algorithms,'' IEEE/ACM Transactions on Networking, 5(5):675-689, + * Oct 1997. + * + * http://www.cs.cmu.edu/~hzhang/papers/TON-97-Oct.ps.gz + * + * [3] I. Stoica and H. Abdel-Wahab, ``Earliest Eligible Virtual Deadline + * First: A Flexible and Accurate Mechanism for Proportional Share + * Resource Allocation,'' technical report. + * + * http://www.cs.berkeley.edu/~istoica/papers/eevdf-tr-95.pdf + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include "blk.h" +#include "bfq.h" + +/* Expiration time of sync (0) and async (1) requests, in ns. */ +static const u64 bfq_fifo_expire[2] = { NSEC_PER_SEC / 4, NSEC_PER_SEC / 8 }; + +/* Maximum backwards seek, in KiB. */ +static const int bfq_back_max = (16 * 1024); + +/* Penalty of a backwards seek, in number of sectors. */ +static const int bfq_back_penalty = 2; + +/* Idling period duration, in ns. */ +static u32 bfq_slice_idle = (NSEC_PER_SEC / 125); + +/* Minimum number of assigned budgets for which stats are safe to compute. */ +static const int bfq_stats_min_budgets = 194; + +/* Default maximum budget values, in sectors and number of requests. */ +static const int bfq_default_max_budget = (16 * 1024); + +/* + * Async to sync throughput distribution is controlled as follows: + * when an async request is served, the entity is charged the number + * of sectors of the request, multiplied by the factor below + */ +static const int bfq_async_charge_factor = 10; + +/* Default timeout values, in jiffies, approximating CFQ defaults. */ +static const int bfq_timeout = (HZ / 8); + +static struct kmem_cache *bfq_pool; + +/* Below this threshold (in ns), we consider thinktime immediate. */ +#define BFQ_MIN_TT (2 * NSEC_PER_MSEC) + +/* hw_tag detection: parallel requests threshold and min samples needed. */ +#define BFQ_HW_QUEUE_THRESHOLD 4 +#define BFQ_HW_QUEUE_SAMPLES 32 + +#define BFQQ_SEEK_THR (sector_t)(8 * 100) +#define BFQQ_SECT_THR_NONROT (sector_t)(2 * 32) +#define BFQQ_CLOSE_THR (sector_t)(8 * 1024) +#define BFQQ_SEEKY(bfqq) (hweight32(bfqq->seek_history) > 32/8) + +/* Min number of samples required to perform peak-rate update */ +#define BFQ_RATE_MIN_SAMPLES 32 +/* Min observation time interval required to perform a peak-rate update (ns) */ +#define BFQ_RATE_MIN_INTERVAL (300*NSEC_PER_MSEC) +/* Target observation time interval for a peak-rate update (ns) */ +#define BFQ_RATE_REF_INTERVAL NSEC_PER_SEC + +/* Shift used for peak rate fixed precision calculations. */ +#define BFQ_RATE_SHIFT 16 + +/* + * By default, BFQ computes the duration of the weight raising for + * interactive applications automatically, using the following formula: + * duration = (R / r) * T, where r is the peak rate of the device, and + * R and T are two reference parameters. + * In particular, R is the peak rate of the reference device (see below), + * and T is a reference time: given the systems that are likely to be + * installed on the reference device according to its speed class, T is + * about the maximum time needed, under BFQ and while reading two files in + * parallel, to load typical large applications on these systems. + * In practice, the slower/faster the device at hand is, the more/less it + * takes to load applications with respect to the reference device. + * Accordingly, the longer/shorter BFQ grants weight raising to interactive + * applications. + * + * BFQ uses four different reference pairs (R, T), depending on: + * . whether the device is rotational or non-rotational; + * . whether the device is slow, such as old or portable HDDs, as well as + * SD cards, or fast, such as newer HDDs and SSDs. + * + * The device's speed class is dynamically (re)detected in + * bfq_update_peak_rate() every time the estimated peak rate is updated. + * + * In the following definitions, R_slow[0]/R_fast[0] and + * T_slow[0]/T_fast[0] are the reference values for a slow/fast + * rotational device, whereas R_slow[1]/R_fast[1] and + * T_slow[1]/T_fast[1] are the reference values for a slow/fast + * non-rotational device. Finally, device_speed_thresh are the + * thresholds used to switch between speed classes. The reference + * rates are not the actual peak rates of the devices used as a + * reference, but slightly lower values. The reason for using these + * slightly lower values is that the peak-rate estimator tends to + * yield slightly lower values than the actual peak rate (it can yield + * the actual peak rate only if there is only one process doing I/O, + * and the process does sequential I/O). + * + * Both the reference peak rates and the thresholds are measured in + * sectors/usec, left-shifted by BFQ_RATE_SHIFT. + */ +static int R_slow[2] = {1000, 10700}; +static int R_fast[2] = {14000, 33000}; +/* + * To improve readability, a conversion function is used to initialize the + * following arrays, which entails that they can be initialized only in a + * function. + */ +static int T_slow[2]; +static int T_fast[2]; +static int device_speed_thresh[2]; + +#define BFQ_SERVICE_TREE_INIT ((struct bfq_service_tree) \ + { RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 }) + +#define RQ_BIC(rq) ((struct bfq_io_cq *) (rq)->elv.priv[0]) +#define RQ_BFQQ(rq) ((rq)->elv.priv[1]) + +static void bfq_schedule_dispatch(struct bfq_data *bfqd); + +#include "bfq-ioc.c" +#include "bfq-sched.c" +#include "bfq-cgroup-included.c" + +#define bfq_class_idle(bfqq) ((bfqq)->ioprio_class == IOPRIO_CLASS_IDLE) +#define bfq_class_rt(bfqq) ((bfqq)->ioprio_class == IOPRIO_CLASS_RT) + +#define bfq_sample_valid(samples) ((samples) > 80) + +/* + * Scheduler run of queue, if there are requests pending and no one in the + * driver that will restart queueing. + */ +static void bfq_schedule_dispatch(struct bfq_data *bfqd) +{ + if (bfqd->queued != 0) { + bfq_log(bfqd, "schedule dispatch"); + kblockd_schedule_work(&bfqd->unplug_work); + } +} + +/* + * Lifted from AS - choose which of rq1 and rq2 that is best served now. + * We choose the request that is closesr to the head right now. Distance + * behind the head is penalized and only allowed to a certain extent. + */ +static struct request *bfq_choose_req(struct bfq_data *bfqd, + struct request *rq1, + struct request *rq2, + sector_t last) +{ + sector_t s1, s2, d1 = 0, d2 = 0; + unsigned long back_max; +#define BFQ_RQ1_WRAP 0x01 /* request 1 wraps */ +#define BFQ_RQ2_WRAP 0x02 /* request 2 wraps */ + unsigned int wrap = 0; /* bit mask: requests behind the disk head? */ + + if (!rq1 || rq1 == rq2) + return rq2; + if (!rq2) + return rq1; + + if (rq_is_sync(rq1) && !rq_is_sync(rq2)) + return rq1; + else if (rq_is_sync(rq2) && !rq_is_sync(rq1)) + return rq2; + if ((rq1->cmd_flags & REQ_META) && !(rq2->cmd_flags & REQ_META)) + return rq1; + else if ((rq2->cmd_flags & REQ_META) && !(rq1->cmd_flags & REQ_META)) + return rq2; + + s1 = blk_rq_pos(rq1); + s2 = blk_rq_pos(rq2); + + /* + * By definition, 1KiB is 2 sectors. + */ + back_max = bfqd->bfq_back_max * 2; + + /* + * Strict one way elevator _except_ in the case where we allow + * short backward seeks which are biased as twice the cost of a + * similar forward seek. + */ + if (s1 >= last) + d1 = s1 - last; + else if (s1 + back_max >= last) + d1 = (last - s1) * bfqd->bfq_back_penalty; + else + wrap |= BFQ_RQ1_WRAP; + + if (s2 >= last) + d2 = s2 - last; + else if (s2 + back_max >= last) + d2 = (last - s2) * bfqd->bfq_back_penalty; + else + wrap |= BFQ_RQ2_WRAP; + + /* Found required data */ + + /* + * By doing switch() on the bit mask "wrap" we avoid having to + * check two variables for all permutations: --> faster! + */ + switch (wrap) { + case 0: /* common case for CFQ: rq1 and rq2 not wrapped */ + if (d1 < d2) + return rq1; + else if (d2 < d1) + return rq2; + + if (s1 >= s2) + return rq1; + else + return rq2; + + case BFQ_RQ2_WRAP: + return rq1; + case BFQ_RQ1_WRAP: + return rq2; + case (BFQ_RQ1_WRAP|BFQ_RQ2_WRAP): /* both rqs wrapped */ + default: + /* + * Since both rqs are wrapped, + * start with the one that's further behind head + * (--> only *one* back seek required), + * since back seek takes more time than forward. + */ + if (s1 <= s2) + return rq1; + else + return rq2; + } +} + +static struct bfq_queue * +bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root, + sector_t sector, struct rb_node **ret_parent, + struct rb_node ***rb_link) +{ + struct rb_node **p, *parent; + struct bfq_queue *bfqq = NULL; + + parent = NULL; + p = &root->rb_node; + while (*p) { + struct rb_node **n; + + parent = *p; + bfqq = rb_entry(parent, struct bfq_queue, pos_node); + + /* + * Sort strictly based on sector. Smallest to the left, + * largest to the right. + */ + if (sector > blk_rq_pos(bfqq->next_rq)) + n = &(*p)->rb_right; + else if (sector < blk_rq_pos(bfqq->next_rq)) + n = &(*p)->rb_left; + else + break; + p = n; + bfqq = NULL; + } + + *ret_parent = parent; + if (rb_link) + *rb_link = p; + + bfq_log(bfqd, "rq_pos_tree_lookup %llu: returning %d", + (unsigned long long) sector, + bfqq ? bfqq->pid : 0); + + return bfqq; +} + +static void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq) +{ + struct rb_node **p, *parent; + struct bfq_queue *__bfqq; + + if (bfqq->pos_root) { + rb_erase(&bfqq->pos_node, bfqq->pos_root); + bfqq->pos_root = NULL; + } + + if (bfq_class_idle(bfqq)) + return; + if (!bfqq->next_rq) + return; + + bfqq->pos_root = &bfq_bfqq_to_bfqg(bfqq)->rq_pos_tree; + __bfqq = bfq_rq_pos_tree_lookup(bfqd, bfqq->pos_root, + blk_rq_pos(bfqq->next_rq), &parent, &p); + if (!__bfqq) { + rb_link_node(&bfqq->pos_node, parent, p); + rb_insert_color(&bfqq->pos_node, bfqq->pos_root); + } else + bfqq->pos_root = NULL; +} + +/* + * Tell whether there are active queues or groups with differentiated weights. + */ +static bool bfq_differentiated_weights(struct bfq_data *bfqd) +{ + /* + * For weights to differ, at least one of the trees must contain + * at least two nodes. + */ + return (!RB_EMPTY_ROOT(&bfqd->queue_weights_tree) && + (bfqd->queue_weights_tree.rb_node->rb_left || + bfqd->queue_weights_tree.rb_node->rb_right) +#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED + ) || + (!RB_EMPTY_ROOT(&bfqd->group_weights_tree) && + (bfqd->group_weights_tree.rb_node->rb_left || + bfqd->group_weights_tree.rb_node->rb_right) +#endif + ); +} + +/* + * The following function returns true if every queue must receive the + * same share of the throughput (this condition is used when deciding + * whether idling may be disabled, see the comments in the function + * bfq_bfqq_may_idle()). + * + * Such a scenario occurs when: + * 1) all active queues have the same weight, + * 2) all active groups at the same level in the groups tree have the same + * weight, + * 3) all active groups at the same level in the groups tree have the same + * number of children. + * + * Unfortunately, keeping the necessary state for evaluating exactly the + * above symmetry conditions would be quite complex and time-consuming. + * Therefore this function evaluates, instead, the following stronger + * sub-conditions, for which it is much easier to maintain the needed + * state: + * 1) all active queues have the same weight, + * 2) all active groups have the same weight, + * 3) all active groups have at most one active child each. + * In particular, the last two conditions are always true if hierarchical + * support and the cgroups interface are not enabled, thus no state needs + * to be maintained in this case. + */ +static bool bfq_symmetric_scenario(struct bfq_data *bfqd) +{ + return !bfq_differentiated_weights(bfqd); +} + +/* + * If the weight-counter tree passed as input contains no counter for + * the weight of the input entity, then add that counter; otherwise just + * increment the existing counter. + * + * Note that weight-counter trees contain few nodes in mostly symmetric + * scenarios. For example, if all queues have the same weight, then the + * weight-counter tree for the queues may contain at most one node. + * This holds even if low_latency is on, because weight-raised queues + * are not inserted in the tree. + * In most scenarios, the rate at which nodes are created/destroyed + * should be low too. + */ +static void bfq_weights_tree_add(struct bfq_data *bfqd, + struct bfq_entity *entity, + struct rb_root *root) +{ + struct rb_node **new = &(root->rb_node), *parent = NULL; + + /* + * Do not insert if the entity is already associated with a + * counter, which happens if: + * 1) the entity is associated with a queue, + * 2) a request arrival has caused the queue to become both + * non-weight-raised, and hence change its weight, and + * backlogged; in this respect, each of the two events + * causes an invocation of this function, + * 3) this is the invocation of this function caused by the + * second event. This second invocation is actually useless, + * and we handle this fact by exiting immediately. More + * efficient or clearer solutions might possibly be adopted. + */ + if (entity->weight_counter) + return; + + while (*new) { + struct bfq_weight_counter *__counter = container_of(*new, + struct bfq_weight_counter, + weights_node); + parent = *new; + + if (entity->weight == __counter->weight) { + entity->weight_counter = __counter; + goto inc_counter; + } + if (entity->weight < __counter->weight) + new = &((*new)->rb_left); + else + new = &((*new)->rb_right); + } + + entity->weight_counter = kzalloc(sizeof(struct bfq_weight_counter), + GFP_ATOMIC); + + /* + * In the unlucky event of an allocation failure, we just + * exit. This will cause the weight of entity to not be + * considered in bfq_differentiated_weights, which, in its + * turn, causes the scenario to be deemed wrongly symmetric in + * case entity's weight would have been the only weight making + * the scenario asymmetric. On the bright side, no unbalance + * will however occur when entity becomes inactive again (the + * invocation of this function is triggered by an activation + * of entity). In fact, bfq_weights_tree_remove does nothing + * if !entity->weight_counter. + */ + if (unlikely(!entity->weight_counter)) + return; + + entity->weight_counter->weight = entity->weight; + rb_link_node(&entity->weight_counter->weights_node, parent, new); + rb_insert_color(&entity->weight_counter->weights_node, root); + +inc_counter: + entity->weight_counter->num_active++; +} + +/* + * Decrement the weight counter associated with the entity, and, if the + * counter reaches 0, remove the counter from the tree. + * See the comments to the function bfq_weights_tree_add() for considerations + * about overhead. + */ +static void bfq_weights_tree_remove(struct bfq_data *bfqd, + struct bfq_entity *entity, + struct rb_root *root) +{ + if (!entity->weight_counter) + return; + + BUG_ON(RB_EMPTY_ROOT(root)); + BUG_ON(entity->weight_counter->weight != entity->weight); + + BUG_ON(!entity->weight_counter->num_active); + entity->weight_counter->num_active--; + if (entity->weight_counter->num_active > 0) + goto reset_entity_pointer; + + rb_erase(&entity->weight_counter->weights_node, root); + kfree(entity->weight_counter); + +reset_entity_pointer: + entity->weight_counter = NULL; +} + +/* + * Return expired entry, or NULL to just start from scratch in rbtree. + */ +static struct request *bfq_check_fifo(struct bfq_queue *bfqq, + struct request *last) +{ + struct request *rq; + + if (bfq_bfqq_fifo_expire(bfqq)) + return NULL; + + bfq_mark_bfqq_fifo_expire(bfqq); + + rq = rq_entry_fifo(bfqq->fifo.next); + + if (rq == last || ktime_get_ns() < rq->fifo_time) + return NULL; + + bfq_log_bfqq(bfqq->bfqd, bfqq, "check_fifo: returned %p", rq); + BUG_ON(RB_EMPTY_NODE(&rq->rb_node)); + return rq; +} + +static struct request *bfq_find_next_rq(struct bfq_data *bfqd, + struct bfq_queue *bfqq, + struct request *last) +{ + struct rb_node *rbnext = rb_next(&last->rb_node); + struct rb_node *rbprev = rb_prev(&last->rb_node); + struct request *next, *prev = NULL; + + BUG_ON(list_empty(&bfqq->fifo)); + + /* Follow expired path, else get first next available. */ + next = bfq_check_fifo(bfqq, last); + if (next) { + BUG_ON(next == last); + return next; + } + + BUG_ON(RB_EMPTY_NODE(&last->rb_node)); + + if (rbprev) + prev = rb_entry_rq(rbprev); + + if (rbnext) + next = rb_entry_rq(rbnext); + else { + rbnext = rb_first(&bfqq->sort_list); + if (rbnext && rbnext != &last->rb_node) + next = rb_entry_rq(rbnext); + } + + return bfq_choose_req(bfqd, next, prev, blk_rq_pos(last)); +} + +/* see the definition of bfq_async_charge_factor for details */ +static unsigned long bfq_serv_to_charge(struct request *rq, + struct bfq_queue *bfqq) +{ + if (bfq_bfqq_sync(bfqq) || bfqq->wr_coeff > 1) + return blk_rq_sectors(rq); + + /* + * If there are no weight-raised queues, then amplify service + * by just the async charge factor; otherwise amplify service + * by twice the async charge factor, to further reduce latency + * for weight-raised queues. + */ + if (bfqq->bfqd->wr_busy_queues == 0) + return blk_rq_sectors(rq) * bfq_async_charge_factor; + + return blk_rq_sectors(rq) * 2 * bfq_async_charge_factor; +} + +/** + * bfq_updated_next_req - update the queue after a new next_rq selection. + * @bfqd: the device data the queue belongs to. + * @bfqq: the queue to update. + * + * If the first request of a queue changes we make sure that the queue + * has enough budget to serve at least its first request (if the + * request has grown). We do this because if the queue has not enough + * budget for its first request, it has to go through two dispatch + * rounds to actually get it dispatched. + */ +static void bfq_updated_next_req(struct bfq_data *bfqd, + struct bfq_queue *bfqq) +{ + struct bfq_entity *entity = &bfqq->entity; + struct bfq_service_tree *st = bfq_entity_service_tree(entity); + struct request *next_rq = bfqq->next_rq; + unsigned long new_budget; + + if (!next_rq) + return; + + if (bfqq == bfqd->in_service_queue) + /* + * In order not to break guarantees, budgets cannot be + * changed after an entity has been selected. + */ + return; + + BUG_ON(entity->tree != &st->active); + BUG_ON(entity == entity->sched_data->in_service_entity); + + new_budget = max_t(unsigned long, bfqq->max_budget, + bfq_serv_to_charge(next_rq, bfqq)); + if (entity->budget != new_budget) { + entity->budget = new_budget; + bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu", + new_budget); + bfq_requeue_bfqq(bfqd, bfqq); + } +} + +static unsigned int bfq_wr_duration(struct bfq_data *bfqd) +{ + u64 dur; + + if (bfqd->bfq_wr_max_time > 0) + return bfqd->bfq_wr_max_time; + + dur = bfqd->RT_prod; + do_div(dur, bfqd->peak_rate); + + /* + * Limit duration between 3 and 13 seconds. Tests show that + * higher values than 13 seconds often yield the opposite of + * the desired result, i.e., worsen responsiveness by letting + * non-interactive and non-soft-real-time applications + * preserve weight raising for a too long time interval. + * + * On the other end, lower values than 3 seconds make it + * difficult for most interactive tasks to complete their jobs + * before weight-raising finishes. + */ + if (dur > msecs_to_jiffies(13000)) + dur = msecs_to_jiffies(13000); + else if (dur < msecs_to_jiffies(3000)) + dur = msecs_to_jiffies(3000); + + return dur; +} + +static void +bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd, + struct bfq_io_cq *bic, bool bfq_already_existing) +{ + unsigned int old_wr_coeff; + bool busy = bfq_already_existing && bfq_bfqq_busy(bfqq); + + if (bic->saved_idle_window) + bfq_mark_bfqq_idle_window(bfqq); + else + bfq_clear_bfqq_idle_window(bfqq); + + if (bic->saved_IO_bound) + bfq_mark_bfqq_IO_bound(bfqq); + else + bfq_clear_bfqq_IO_bound(bfqq); + + if (unlikely(busy)) + old_wr_coeff = bfqq->wr_coeff; + + bfqq->wr_coeff = bic->saved_wr_coeff; + bfqq->wr_start_at_switch_to_srt = bic->saved_wr_start_at_switch_to_srt; + BUG_ON(time_is_after_jiffies(bfqq->wr_start_at_switch_to_srt)); + bfqq->last_wr_start_finish = bic->saved_last_wr_start_finish; + bfqq->wr_cur_max_time = bic->saved_wr_cur_max_time; + BUG_ON(time_is_after_jiffies(bfqq->last_wr_start_finish)); + + if (bfqq->wr_coeff > 1 && (bfq_bfqq_in_large_burst(bfqq) || + time_is_before_jiffies(bfqq->last_wr_start_finish + + bfqq->wr_cur_max_time))) { + bfq_log_bfqq(bfqq->bfqd, bfqq, + "resume state: switching off wr (%lu + %lu < %lu)", + bfqq->last_wr_start_finish, bfqq->wr_cur_max_time, + jiffies); + + bfqq->wr_coeff = 1; + } + + /* make sure weight will be updated, however we got here */ + bfqq->entity.prio_changed = 1; + + if (likely(!busy)) + return; + + if (old_wr_coeff == 1 && bfqq->wr_coeff > 1) { + bfqd->wr_busy_queues++; + BUG_ON(bfqd->wr_busy_queues > bfqd->busy_queues); + } else if (old_wr_coeff > 1 && bfqq->wr_coeff == 1) { + bfqd->wr_busy_queues--; + BUG_ON(bfqd->wr_busy_queues < 0); + } +} + +static int bfqq_process_refs(struct bfq_queue *bfqq) +{ + int process_refs, io_refs; + + lockdep_assert_held(bfqq->bfqd->queue->queue_lock); + + io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE]; + process_refs = bfqq->ref - io_refs - bfqq->entity.on_st; + BUG_ON(process_refs < 0); + return process_refs; +} + +/* Empty burst list and add just bfqq (see comments to bfq_handle_burst) */ +static void bfq_reset_burst_list(struct bfq_data *bfqd, struct bfq_queue *bfqq) +{ + struct bfq_queue *item; + struct hlist_node *n; + + hlist_for_each_entry_safe(item, n, &bfqd->burst_list, burst_list_node) + hlist_del_init(&item->burst_list_node); + hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list); + bfqd->burst_size = 1; + bfqd->burst_parent_entity = bfqq->entity.parent; +} + +/* Add bfqq to the list of queues in current burst (see bfq_handle_burst) */ +static void bfq_add_to_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq) +{ + /* Increment burst size to take into account also bfqq */ + bfqd->burst_size++; + + bfq_log_bfqq(bfqd, bfqq, "add_to_burst %d", bfqd->burst_size); + + BUG_ON(bfqd->burst_size > bfqd->bfq_large_burst_thresh); + + if (bfqd->burst_size == bfqd->bfq_large_burst_thresh) { + struct bfq_queue *pos, *bfqq_item; + struct hlist_node *n; + + /* + * Enough queues have been activated shortly after each + * other to consider this burst as large. + */ + bfqd->large_burst = true; + bfq_log_bfqq(bfqd, bfqq, "add_to_burst: large burst started"); + + /* + * We can now mark all queues in the burst list as + * belonging to a large burst. + */ + hlist_for_each_entry(bfqq_item, &bfqd->burst_list, + burst_list_node) { + bfq_mark_bfqq_in_large_burst(bfqq_item); + bfq_log_bfqq(bfqd, bfqq_item, "marked in large burst"); + } + bfq_mark_bfqq_in_large_burst(bfqq); + bfq_log_bfqq(bfqd, bfqq, "marked in large burst"); + + /* + * From now on, and until the current burst finishes, any + * new queue being activated shortly after the last queue + * was inserted in the burst can be immediately marked as + * belonging to a large burst. So the burst list is not + * needed any more. Remove it. + */ + hlist_for_each_entry_safe(pos, n, &bfqd->burst_list, + burst_list_node) + hlist_del_init(&pos->burst_list_node); + } else /* + * Burst not yet large: add bfqq to the burst list. Do + * not increment the ref counter for bfqq, because bfqq + * is removed from the burst list before freeing bfqq + * in put_queue. + */ + hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list); +} + +/* + * If many queues belonging to the same group happen to be created + * shortly after each other, then the processes associated with these + * queues have typically a common goal. In particular, bursts of queue + * creations are usually caused by services or applications that spawn + * many parallel threads/processes. Examples are systemd during boot, + * or git grep. To help these processes get their job done as soon as + * possible, it is usually better to not grant either weight-raising + * or device idling to their queues. + * + * In this comment we describe, firstly, the reasons why this fact + * holds, and, secondly, the next function, which implements the main + * steps needed to properly mark these queues so that they can then be + * treated in a different way. + * + * The above services or applications benefit mostly from a high + * throughput: the quicker the requests of the activated queues are + * cumulatively served, the sooner the target job of these queues gets + * completed. As a consequence, weight-raising any of these queues, + * which also implies idling the device for it, is almost always + * counterproductive. In most cases it just lowers throughput. + * + * On the other hand, a burst of queue creations may be caused also by + * the start of an application that does not consist of a lot of + * parallel I/O-bound threads. In fact, with a complex application, + * several short processes may need to be executed to start-up the + * application. In this respect, to start an application as quickly as + * possible, the best thing to do is in any case to privilege the I/O + * related to the application with respect to all other + * I/O. Therefore, the best strategy to start as quickly as possible + * an application that causes a burst of queue creations is to + * weight-raise all the queues created during the burst. This is the + * exact opposite of the best strategy for the other type of bursts. + * + * In the end, to take the best action for each of the two cases, the + * two types of bursts need to be distinguished. Fortunately, this + * seems relatively easy, by looking at the sizes of the bursts. In + * particular, we found a threshold such that only bursts with a + * larger size than that threshold are apparently caused by + * services or commands such as systemd or git grep. For brevity, + * hereafter we call just 'large' these bursts. BFQ *does not* + * weight-raise queues whose creation occurs in a large burst. In + * addition, for each of these queues BFQ performs or does not perform + * idling depending on which choice boosts the throughput more. The + * exact choice depends on the device and request pattern at + * hand. + * + * Unfortunately, false positives may occur while an interactive task + * is starting (e.g., an application is being started). The + * consequence is that the queues associated with the task do not + * enjoy weight raising as expected. Fortunately these false positives + * are very rare. They typically occur if some service happens to + * start doing I/O exactly when the interactive task starts. + * + * Turning back to the next function, it implements all the steps + * needed to detect the occurrence of a large burst and to properly + * mark all the queues belonging to it (so that they can then be + * treated in a different way). This goal is achieved by maintaining a + * "burst list" that holds, temporarily, the queues that belong to the + * burst in progress. The list is then used to mark these queues as + * belonging to a large burst if the burst does become large. The main + * steps are the following. + * + * . when the very first queue is created, the queue is inserted into the + * list (as it could be the first queue in a possible burst) + * + * . if the current burst has not yet become large, and a queue Q that does + * not yet belong to the burst is activated shortly after the last time + * at which a new queue entered the burst list, then the function appends + * Q to the burst list + * + * . if, as a consequence of the previous step, the burst size reaches + * the large-burst threshold, then + * + * . all the queues in the burst list are marked as belonging to a + * large burst + * + * . the burst list is deleted; in fact, the burst list already served + * its purpose (keeping temporarily track of the queues in a burst, + * so as to be able to mark them as belonging to a large burst in the + * previous sub-step), and now is not needed any more + * + * . the device enters a large-burst mode + * + * . if a queue Q that does not belong to the burst is created while + * the device is in large-burst mode and shortly after the last time + * at which a queue either entered the burst list or was marked as + * belonging to the current large burst, then Q is immediately marked + * as belonging to a large burst. + * + * . if a queue Q that does not belong to the burst is created a while + * later, i.e., not shortly after, than the last time at which a queue + * either entered the burst list or was marked as belonging to the + * current large burst, then the current burst is deemed as finished and: + * + * . the large-burst mode is reset if set + * + * . the burst list is emptied + * + * . Q is inserted in the burst list, as Q may be the first queue + * in a possible new burst (then the burst list contains just Q + * after this step). + */ +static void bfq_handle_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq) +{ + /* + * If bfqq is already in the burst list or is part of a large + * burst, or finally has just been split, then there is + * nothing else to do. + */ + if (!hlist_unhashed(&bfqq->burst_list_node) || + bfq_bfqq_in_large_burst(bfqq) || + time_is_after_eq_jiffies(bfqq->split_time + + msecs_to_jiffies(10))) + return; + + /* + * If bfqq's creation happens late enough, or bfqq belongs to + * a different group than the burst group, then the current + * burst is finished, and related data structures must be + * reset. + * + * In this respect, consider the special case where bfqq is + * the very first queue created after BFQ is selected for this + * device. In this case, last_ins_in_burst and + * burst_parent_entity are not yet significant when we get + * here. But it is easy to verify that, whether or not the + * following condition is true, bfqq will end up being + * inserted into the burst list. In particular the list will + * happen to contain only bfqq. And this is exactly what has + * to happen, as bfqq may be the first queue of the first + * burst. + */ + if (time_is_before_jiffies(bfqd->last_ins_in_burst + + bfqd->bfq_burst_interval) || + bfqq->entity.parent != bfqd->burst_parent_entity) { + bfqd->large_burst = false; + bfq_reset_burst_list(bfqd, bfqq); + bfq_log_bfqq(bfqd, bfqq, + "handle_burst: late activation or different group"); + goto end; + } + + /* + * If we get here, then bfqq is being activated shortly after the + * last queue. So, if the current burst is also large, we can mark + * bfqq as belonging to this large burst immediately. + */ + if (bfqd->large_burst) { + bfq_log_bfqq(bfqd, bfqq, "handle_burst: marked in burst"); + bfq_mark_bfqq_in_large_burst(bfqq); + goto end; + } + + /* + * If we get here, then a large-burst state has not yet been + * reached, but bfqq is being activated shortly after the last + * queue. Then we add bfqq to the burst. + */ + bfq_add_to_burst(bfqd, bfqq); +end: + /* + * At this point, bfqq either has been added to the current + * burst or has caused the current burst to terminate and a + * possible new burst to start. In particular, in the second + * case, bfqq has become the first queue in the possible new + * burst. In both cases last_ins_in_burst needs to be moved + * forward. + */ + bfqd->last_ins_in_burst = jiffies; + +} + +static int bfq_bfqq_budget_left(struct bfq_queue *bfqq) +{ + struct bfq_entity *entity = &bfqq->entity; + + return entity->budget - entity->service; +} + +/* + * If enough samples have been computed, return the current max budget + * stored in bfqd, which is dynamically updated according to the + * estimated disk peak rate; otherwise return the default max budget + */ +static int bfq_max_budget(struct bfq_data *bfqd) +{ + if (bfqd->budgets_assigned < bfq_stats_min_budgets) + return bfq_default_max_budget; + else + return bfqd->bfq_max_budget; +} + +/* + * Return min budget, which is a fraction of the current or default + * max budget (trying with 1/32) + */ +static int bfq_min_budget(struct bfq_data *bfqd) +{ + if (bfqd->budgets_assigned < bfq_stats_min_budgets) + return bfq_default_max_budget / 32; + else + return bfqd->bfq_max_budget / 32; +} + +static void bfq_bfqq_expire(struct bfq_data *bfqd, + struct bfq_queue *bfqq, + bool compensate, + enum bfqq_expiration reason); + +/* + * The next function, invoked after the input queue bfqq switches from + * idle to busy, updates the budget of bfqq. The function also tells + * whether the in-service queue should be expired, by returning + * true. The purpose of expiring the in-service queue is to give bfqq + * the chance to possibly preempt the in-service queue, and the reason + * for preempting the in-service queue is to achieve one of the two + * goals below. + * + * 1. Guarantee to bfqq its reserved bandwidth even if bfqq has + * expired because it has remained idle. In particular, bfqq may have + * expired for one of the following two reasons: + * + * - BFQ_BFQQ_NO_MORE_REQUEST bfqq did not enjoy any device idling and + * did not make it to issue a new request before its last request + * was served; + * + * - BFQ_BFQQ_TOO_IDLE bfqq did enjoy device idling, but did not issue + * a new request before the expiration of the idling-time. + * + * Even if bfqq has expired for one of the above reasons, the process + * associated with the queue may be however issuing requests greedily, + * and thus be sensitive to the bandwidth it receives (bfqq may have + * remained idle for other reasons: CPU high load, bfqq not enjoying + * idling, I/O throttling somewhere in the path from the process to + * the I/O scheduler, ...). But if, after every expiration for one of + * the above two reasons, bfqq has to wait for the service of at least + * one full budget of another queue before being served again, then + * bfqq is likely to get a much lower bandwidth or resource time than + * its reserved ones. To address this issue, two countermeasures need + * to be taken. + * + * First, the budget and the timestamps of bfqq need to be updated in + * a special way on bfqq reactivation: they need to be updated as if + * bfqq did not remain idle and did not expire. In fact, if they are + * computed as if bfqq expired and remained idle until reactivation, + * then the process associated with bfqq is treated as if, instead of + * being greedy, it stopped issuing requests when bfqq remained idle, + * and restarts issuing requests only on this reactivation. In other + * words, the scheduler does not help the process recover the "service + * hole" between bfqq expiration and reactivation. As a consequence, + * the process receives a lower bandwidth than its reserved one. In + * contrast, to recover this hole, the budget must be updated as if + * bfqq was not expired at all before this reactivation, i.e., it must + * be set to the value of the remaining budget when bfqq was + * expired. Along the same line, timestamps need to be assigned the + * value they had the last time bfqq was selected for service, i.e., + * before last expiration. Thus timestamps need to be back-shifted + * with respect to their normal computation (see [1] for more details + * on this tricky aspect). + * + * Secondly, to allow the process to recover the hole, the in-service + * queue must be expired too, to give bfqq the chance to preempt it + * immediately. In fact, if bfqq has to wait for a full budget of the + * in-service queue to be completed, then it may become impossible to + * let the process recover the hole, even if the back-shifted + * timestamps of bfqq are lower than those of the in-service queue. If + * this happens for most or all of the holes, then the process may not + * receive its reserved bandwidth. In this respect, it is worth noting + * that, being the service of outstanding requests unpreemptible, a + * little fraction of the holes may however be unrecoverable, thereby + * causing a little loss of bandwidth. + * + * The last important point is detecting whether bfqq does need this + * bandwidth recovery. In this respect, the next function deems the + * process associated with bfqq greedy, and thus allows it to recover + * the hole, if: 1) the process is waiting for the arrival of a new + * request (which implies that bfqq expired for one of the above two + * reasons), and 2) such a request has arrived soon. The first + * condition is controlled through the flag non_blocking_wait_rq, + * while the second through the flag arrived_in_time. If both + * conditions hold, then the function computes the budget in the + * above-described special way, and signals that the in-service queue + * should be expired. Timestamp back-shifting is done later in + * __bfq_activate_entity. + * + * 2. Reduce latency. Even if timestamps are not backshifted to let + * the process associated with bfqq recover a service hole, bfqq may + * however happen to have, after being (re)activated, a lower finish + * timestamp than the in-service queue. That is, the next budget of + * bfqq may have to be completed before the one of the in-service + * queue. If this is the case, then preempting the in-service queue + * allows this goal to be achieved, apart from the unpreemptible, + * outstanding requests mentioned above. + * + * Unfortunately, regardless of which of the above two goals one wants + * to achieve, service trees need first to be updated to know whether + * the in-service queue must be preempted. To have service trees + * correctly updated, the in-service queue must be expired and + * rescheduled, and bfqq must be scheduled too. This is one of the + * most costly operations (in future versions, the scheduling + * mechanism may be re-designed in such a way to make it possible to + * know whether preemption is needed without needing to update service + * trees). In addition, queue preemptions almost always cause random + * I/O, and thus loss of throughput. Because of these facts, the next + * function adopts the following simple scheme to avoid both costly + * operations and too frequent preemptions: it requests the expiration + * of the in-service queue (unconditionally) only for queues that need + * to recover a hole, or that either are weight-raised or deserve to + * be weight-raised. + */ +static bool bfq_bfqq_update_budg_for_activation(struct bfq_data *bfqd, + struct bfq_queue *bfqq, + bool arrived_in_time, + bool wr_or_deserves_wr) +{ + struct bfq_entity *entity = &bfqq->entity; + + if (bfq_bfqq_non_blocking_wait_rq(bfqq) && arrived_in_time) { + /* + * We do not clear the flag non_blocking_wait_rq here, as + * the latter is used in bfq_activate_bfqq to signal + * that timestamps need to be back-shifted (and is + * cleared right after). + */ + + /* + * In next assignment we rely on that either + * entity->service or entity->budget are not updated + * on expiration if bfqq is empty (see + * __bfq_bfqq_recalc_budget). Thus both quantities + * remain unchanged after such an expiration, and the + * following statement therefore assigns to + * entity->budget the remaining budget on such an + * expiration. For clarity, entity->service is not + * updated on expiration in any case, and, in normal + * operation, is reset only when bfqq is selected for + * service (see bfq_get_next_queue). + */ + BUG_ON(bfqq->max_budget < 0); + entity->budget = min_t(unsigned long, + bfq_bfqq_budget_left(bfqq), + bfqq->max_budget); + + BUG_ON(entity->budget < 0); + return true; + } + + BUG_ON(bfqq->max_budget < 0); + entity->budget = max_t(unsigned long, bfqq->max_budget, + bfq_serv_to_charge(bfqq->next_rq, bfqq)); + BUG_ON(entity->budget < 0); + + bfq_clear_bfqq_non_blocking_wait_rq(bfqq); + return wr_or_deserves_wr; +} + +static void bfq_update_bfqq_wr_on_rq_arrival(struct bfq_data *bfqd, + struct bfq_queue *bfqq, + unsigned int old_wr_coeff, + bool wr_or_deserves_wr, + bool interactive, + bool in_burst, + bool soft_rt) +{ + if (old_wr_coeff == 1 && wr_or_deserves_wr) { + /* start a weight-raising period */ + if (interactive) { + bfqq->wr_coeff = bfqd->bfq_wr_coeff; + bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); + } else { + bfqq->wr_start_at_switch_to_srt = jiffies; + bfqq->wr_coeff = bfqd->bfq_wr_coeff * + BFQ_SOFTRT_WEIGHT_FACTOR; + bfqq->wr_cur_max_time = + bfqd->bfq_wr_rt_max_time; + } + /* + * If needed, further reduce budget to make sure it is + * close to bfqq's backlog, so as to reduce the + * scheduling-error component due to a too large + * budget. Do not care about throughput consequences, + * but only about latency. Finally, do not assign a + * too small budget either, to avoid increasing + * latency by causing too frequent expirations. + */ + bfqq->entity.budget = min_t(unsigned long, + bfqq->entity.budget, + 2 * bfq_min_budget(bfqd)); + + bfq_log_bfqq(bfqd, bfqq, + "wrais starting at %lu, rais_max_time %u", + jiffies, + jiffies_to_msecs(bfqq->wr_cur_max_time)); + } else if (old_wr_coeff > 1) { + if (interactive) { /* update wr coeff and duration */ + bfqq->wr_coeff = bfqd->bfq_wr_coeff; + bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); + } else if (in_burst) { + bfqq->wr_coeff = 1; + bfq_log_bfqq(bfqd, bfqq, + "wrais ending at %lu, rais_max_time %u", + jiffies, + jiffies_to_msecs(bfqq-> + wr_cur_max_time)); + } else if (soft_rt) { + /* + * The application is now or still meeting the + * requirements for being deemed soft rt. We + * can then correctly and safely (re)charge + * the weight-raising duration for the + * application with the weight-raising + * duration for soft rt applications. + * + * In particular, doing this recharge now, i.e., + * before the weight-raising period for the + * application finishes, reduces the probability + * of the following negative scenario: + * 1) the weight of a soft rt application is + * raised at startup (as for any newly + * created application), + * 2) since the application is not interactive, + * at a certain time weight-raising is + * stopped for the application, + * 3) at that time the application happens to + * still have pending requests, and hence + * is destined to not have a chance to be + * deemed soft rt before these requests are + * completed (see the comments to the + * function bfq_bfqq_softrt_next_start() + * for details on soft rt detection), + * 4) these pending requests experience a high + * latency because the application is not + * weight-raised while they are pending. + */ + if (bfqq->wr_cur_max_time != + bfqd->bfq_wr_rt_max_time) { + bfqq->wr_start_at_switch_to_srt = + bfqq->last_wr_start_finish; + BUG_ON(time_is_after_jiffies(bfqq->last_wr_start_finish)); + + bfqq->wr_cur_max_time = + bfqd->bfq_wr_rt_max_time; + bfqq->wr_coeff = bfqd->bfq_wr_coeff * + BFQ_SOFTRT_WEIGHT_FACTOR; + bfq_log_bfqq(bfqd, bfqq, + "switching to soft_rt wr"); + } else + bfq_log_bfqq(bfqd, bfqq, + "moving forward soft_rt wr duration"); + bfqq->last_wr_start_finish = jiffies; + } + } +} + +static bool bfq_bfqq_idle_for_long_time(struct bfq_data *bfqd, + struct bfq_queue *bfqq) +{ + return bfqq->dispatched == 0 && + time_is_before_jiffies( + bfqq->budget_timeout + + bfqd->bfq_wr_min_idle_time); +} + +static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd, + struct bfq_queue *bfqq, + int old_wr_coeff, + struct request *rq, + bool *interactive) +{ + bool soft_rt, in_burst, wr_or_deserves_wr, + bfqq_wants_to_preempt, + idle_for_long_time = bfq_bfqq_idle_for_long_time(bfqd, bfqq), + /* + * See the comments on + * bfq_bfqq_update_budg_for_activation for + * details on the usage of the next variable. + */ + arrived_in_time = ktime_get_ns() <= + RQ_BIC(rq)->ttime.last_end_request + + bfqd->bfq_slice_idle * 3; + + bfq_log_bfqq(bfqd, bfqq, + "bfq_add_request non-busy: " + "jiffies %lu, in_time %d, idle_long %d busyw %d " + "wr_coeff %u", + jiffies, arrived_in_time, + idle_for_long_time, + bfq_bfqq_non_blocking_wait_rq(bfqq), + old_wr_coeff); + + BUG_ON(bfqq->entity.budget < bfqq->entity.service); + + BUG_ON(bfqq == bfqd->in_service_queue); + bfqg_stats_update_io_add(bfqq_group(RQ_BFQQ(rq)), bfqq, rq->cmd_flags); + + /* + * bfqq deserves to be weight-raised if: + * - it is sync, + * - it does not belong to a large burst, + * - it has been idle for enough time or is soft real-time, + * - is linked to a bfq_io_cq (it is not shared in any sense) + */ + in_burst = bfq_bfqq_in_large_burst(bfqq); + soft_rt = bfqd->bfq_wr_max_softrt_rate > 0 && + !in_burst && + time_is_before_jiffies(bfqq->soft_rt_next_start); + *interactive = + !in_burst && + idle_for_long_time; + wr_or_deserves_wr = bfqd->low_latency && + (bfqq->wr_coeff > 1 || + (bfq_bfqq_sync(bfqq) && + bfqq->bic && (*interactive || soft_rt))); + + bfq_log_bfqq(bfqd, bfqq, + "bfq_add_request: " + "in_burst %d, " + "soft_rt %d (next %lu), inter %d, bic %p", + bfq_bfqq_in_large_burst(bfqq), soft_rt, + bfqq->soft_rt_next_start, + *interactive, + bfqq->bic); + + /* + * Using the last flag, update budget and check whether bfqq + * may want to preempt the in-service queue. + */ + bfqq_wants_to_preempt = + bfq_bfqq_update_budg_for_activation(bfqd, bfqq, + arrived_in_time, + wr_or_deserves_wr); + + /* + * If bfqq happened to be activated in a burst, but has been + * idle for much more than an interactive queue, then we + * assume that, in the overall I/O initiated in the burst, the + * I/O associated with bfqq is finished. So bfqq does not need + * to be treated as a queue belonging to a burst + * anymore. Accordingly, we reset bfqq's in_large_burst flag + * if set, and remove bfqq from the burst list if it's + * there. We do not decrement burst_size, because the fact + * that bfqq does not need to belong to the burst list any + * more does not invalidate the fact that bfqq was created in + * a burst. + */ + if (likely(!bfq_bfqq_just_created(bfqq)) && + idle_for_long_time && + time_is_before_jiffies( + bfqq->budget_timeout + + msecs_to_jiffies(10000))) { + hlist_del_init(&bfqq->burst_list_node); + bfq_clear_bfqq_in_large_burst(bfqq); + } + + bfq_clear_bfqq_just_created(bfqq); + + if (!bfq_bfqq_IO_bound(bfqq)) { + if (arrived_in_time) { + bfqq->requests_within_timer++; + if (bfqq->requests_within_timer >= + bfqd->bfq_requests_within_timer) + bfq_mark_bfqq_IO_bound(bfqq); + } else + bfqq->requests_within_timer = 0; + bfq_log_bfqq(bfqd, bfqq, "requests in time %d", + bfqq->requests_within_timer); + } + + if (bfqd->low_latency) { + if (unlikely(time_is_after_jiffies(bfqq->split_time))) + /* wraparound */ + bfqq->split_time = + jiffies - bfqd->bfq_wr_min_idle_time - 1; + + if (time_is_before_jiffies(bfqq->split_time + + bfqd->bfq_wr_min_idle_time)) { + bfq_update_bfqq_wr_on_rq_arrival(bfqd, bfqq, + old_wr_coeff, + wr_or_deserves_wr, + *interactive, + in_burst, + soft_rt); + + if (old_wr_coeff != bfqq->wr_coeff) + bfqq->entity.prio_changed = 1; + } + } + + bfqq->last_idle_bklogged = jiffies; + bfqq->service_from_backlogged = 0; + bfq_clear_bfqq_softrt_update(bfqq); + + bfq_add_bfqq_busy(bfqd, bfqq); + + /* + * Expire in-service queue only if preemption may be needed + * for guarantees. In this respect, the function + * next_queue_may_preempt just checks a simple, necessary + * condition, and not a sufficient condition based on + * timestamps. In fact, for the latter condition to be + * evaluated, timestamps would need first to be updated, and + * this operation is quite costly (see the comments on the + * function bfq_bfqq_update_budg_for_activation). + */ + if (bfqd->in_service_queue && bfqq_wants_to_preempt && + bfqd->in_service_queue->wr_coeff < bfqq->wr_coeff && + next_queue_may_preempt(bfqd)) { + struct bfq_queue *in_serv = + bfqd->in_service_queue; + BUG_ON(in_serv == bfqq); + + bfq_bfqq_expire(bfqd, bfqd->in_service_queue, + false, BFQ_BFQQ_PREEMPTED); + } +} + +static void bfq_add_request(struct request *rq) +{ + struct bfq_queue *bfqq = RQ_BFQQ(rq); + struct bfq_data *bfqd = bfqq->bfqd; + struct request *next_rq, *prev; + unsigned int old_wr_coeff = bfqq->wr_coeff; + bool interactive = false; + + bfq_log_bfqq(bfqd, bfqq, "add_request: size %u %s", + blk_rq_sectors(rq), rq_is_sync(rq) ? "S" : "A"); + + if (bfqq->wr_coeff > 1) /* queue is being weight-raised */ + bfq_log_bfqq(bfqd, bfqq, + "raising period dur %u/%u msec, old coeff %u, w %d(%d)", + jiffies_to_msecs(jiffies - bfqq->last_wr_start_finish), + jiffies_to_msecs(bfqq->wr_cur_max_time), + bfqq->wr_coeff, + bfqq->entity.weight, bfqq->entity.orig_weight); + + bfqq->queued[rq_is_sync(rq)]++; + bfqd->queued++; + + elv_rb_add(&bfqq->sort_list, rq); + + /* + * Check if this request is a better next-to-serve candidate. + */ + prev = bfqq->next_rq; + next_rq = bfq_choose_req(bfqd, bfqq->next_rq, rq, bfqd->last_position); + BUG_ON(!next_rq); + bfqq->next_rq = next_rq; + + /* + * Adjust priority tree position, if next_rq changes. + */ + if (prev != bfqq->next_rq) + bfq_pos_tree_add_move(bfqd, bfqq); + + if (!bfq_bfqq_busy(bfqq)) /* switching to busy ... */ + bfq_bfqq_handle_idle_busy_switch(bfqd, bfqq, old_wr_coeff, + rq, &interactive); + else { + if (bfqd->low_latency && old_wr_coeff == 1 && !rq_is_sync(rq) && + time_is_before_jiffies( + bfqq->last_wr_start_finish + + bfqd->bfq_wr_min_inter_arr_async)) { + bfqq->wr_coeff = bfqd->bfq_wr_coeff; + bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); + + bfqd->wr_busy_queues++; + BUG_ON(bfqd->wr_busy_queues > bfqd->busy_queues); + bfqq->entity.prio_changed = 1; + bfq_log_bfqq(bfqd, bfqq, + "non-idle wrais starting, " + "wr_max_time %u wr_busy %d", + jiffies_to_msecs(bfqq->wr_cur_max_time), + bfqd->wr_busy_queues); + } + if (prev != bfqq->next_rq) + bfq_updated_next_req(bfqd, bfqq); + } + + /* + * Assign jiffies to last_wr_start_finish in the following + * cases: + * + * . if bfqq is not going to be weight-raised, because, for + * non weight-raised queues, last_wr_start_finish stores the + * arrival time of the last request; as of now, this piece + * of information is used only for deciding whether to + * weight-raise async queues + * + * . if bfqq is not weight-raised, because, if bfqq is now + * switching to weight-raised, then last_wr_start_finish + * stores the time when weight-raising starts + * + * . if bfqq is interactive, because, regardless of whether + * bfqq is currently weight-raised, the weight-raising + * period must start or restart (this case is considered + * separately because it is not detected by the above + * conditions, if bfqq is already weight-raised) + * + * last_wr_start_finish has to be updated also if bfqq is soft + * real-time, because the weight-raising period is constantly + * restarted on idle-to-busy transitions for these queues, but + * this is already done in bfq_bfqq_handle_idle_busy_switch if + * needed. + */ + if (bfqd->low_latency && + (old_wr_coeff == 1 || bfqq->wr_coeff == 1 || interactive)) + bfqq->last_wr_start_finish = jiffies; +} + +static struct request *bfq_find_rq_fmerge(struct bfq_data *bfqd, + struct bio *bio) +{ + struct task_struct *tsk = current; + struct bfq_io_cq *bic; + struct bfq_queue *bfqq; + + bic = bfq_bic_lookup(bfqd, tsk->io_context); + if (!bic) + return NULL; + + bfqq = bic_to_bfqq(bic, op_is_sync(bio->bi_opf)); + if (bfqq) + return elv_rb_find(&bfqq->sort_list, bio_end_sector(bio)); + + return NULL; +} + +static sector_t get_sdist(sector_t last_pos, struct request *rq) +{ + sector_t sdist = 0; + + if (last_pos) { + if (last_pos < blk_rq_pos(rq)) + sdist = blk_rq_pos(rq) - last_pos; + else + sdist = last_pos - blk_rq_pos(rq); + } + + return sdist; +} + +static void bfq_activate_request(struct request_queue *q, struct request *rq) +{ + struct bfq_data *bfqd = q->elevator->elevator_data; + bfqd->rq_in_driver++; +} + +static void bfq_deactivate_request(struct request_queue *q, struct request *rq) +{ + struct bfq_data *bfqd = q->elevator->elevator_data; + + BUG_ON(bfqd->rq_in_driver == 0); + bfqd->rq_in_driver--; +} + +static void bfq_remove_request(struct request *rq) +{ + struct bfq_queue *bfqq = RQ_BFQQ(rq); + struct bfq_data *bfqd = bfqq->bfqd; + const int sync = rq_is_sync(rq); + + BUG_ON(bfqq->entity.service > bfqq->entity.budget && + bfqq == bfqd->in_service_queue); + + if (bfqq->next_rq == rq) { + bfqq->next_rq = bfq_find_next_rq(bfqd, bfqq, rq); + bfq_updated_next_req(bfqd, bfqq); + } + + if (rq->queuelist.prev != &rq->queuelist) + list_del_init(&rq->queuelist); + BUG_ON(bfqq->queued[sync] == 0); + bfqq->queued[sync]--; + bfqd->queued--; + elv_rb_del(&bfqq->sort_list, rq); + + if (RB_EMPTY_ROOT(&bfqq->sort_list)) { + bfqq->next_rq = NULL; + + BUG_ON(bfqq->entity.budget < 0); + + if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->in_service_queue) { + BUG_ON(bfqq->ref < 2); /* referred by rq and on tree */ + bfq_del_bfqq_busy(bfqd, bfqq, false); + /* + * bfqq emptied. In normal operation, when + * bfqq is empty, bfqq->entity.service and + * bfqq->entity.budget must contain, + * respectively, the service received and the + * budget used last time bfqq emptied. These + * facts do not hold in this case, as at least + * this last removal occurred while bfqq is + * not in service. To avoid inconsistencies, + * reset both bfqq->entity.service and + * bfqq->entity.budget, if bfqq has still a + * process that may issue I/O requests to it. + */ + bfqq->entity.budget = bfqq->entity.service = 0; + } + + /* + * Remove queue from request-position tree as it is empty. + */ + if (bfqq->pos_root) { + rb_erase(&bfqq->pos_node, bfqq->pos_root); + bfqq->pos_root = NULL; + } + } + + if (rq->cmd_flags & REQ_META) { + BUG_ON(bfqq->meta_pending == 0); + bfqq->meta_pending--; + } + bfqg_stats_update_io_remove(bfqq_group(bfqq), rq->cmd_flags); +} + +static enum elv_merge bfq_merge(struct request_queue *q, struct request **req, + struct bio *bio) +{ + struct bfq_data *bfqd = q->elevator->elevator_data; + struct request *__rq; + + __rq = bfq_find_rq_fmerge(bfqd, bio); + if (__rq && elv_bio_merge_ok(__rq, bio)) { + *req = __rq; + return ELEVATOR_FRONT_MERGE; + } + + return ELEVATOR_NO_MERGE; +} + +static void bfq_merged_request(struct request_queue *q, struct request *req, + enum elv_merge type) +{ + if (type == ELEVATOR_FRONT_MERGE && + rb_prev(&req->rb_node) && + blk_rq_pos(req) < + blk_rq_pos(container_of(rb_prev(&req->rb_node), + struct request, rb_node))) { + struct bfq_queue *bfqq = RQ_BFQQ(req); + struct bfq_data *bfqd = bfqq->bfqd; + struct request *prev, *next_rq; + + /* Reposition request in its sort_list */ + elv_rb_del(&bfqq->sort_list, req); + elv_rb_add(&bfqq->sort_list, req); + /* Choose next request to be served for bfqq */ + prev = bfqq->next_rq; + next_rq = bfq_choose_req(bfqd, bfqq->next_rq, req, + bfqd->last_position); + BUG_ON(!next_rq); + bfqq->next_rq = next_rq; + /* + * If next_rq changes, update both the queue's budget to + * fit the new request and the queue's position in its + * rq_pos_tree. + */ + if (prev != bfqq->next_rq) { + bfq_updated_next_req(bfqd, bfqq); + bfq_pos_tree_add_move(bfqd, bfqq); + } + } +} + +#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED +static void bfq_bio_merged(struct request_queue *q, struct request *req, + struct bio *bio) +{ + bfqg_stats_update_io_merged(bfqq_group(RQ_BFQQ(req)), bio->bi_opf); +} +#endif + +static void bfq_merged_requests(struct request_queue *q, struct request *rq, + struct request *next) +{ + struct bfq_queue *bfqq = RQ_BFQQ(rq), *next_bfqq = RQ_BFQQ(next); + + /* + * If next and rq belong to the same bfq_queue and next is older + * than rq, then reposition rq in the fifo (by substituting next + * with rq). Otherwise, if next and rq belong to different + * bfq_queues, never reposition rq: in fact, we would have to + * reposition it with respect to next's position in its own fifo, + * which would most certainly be too expensive with respect to + * the benefits. + */ + if (bfqq == next_bfqq && + !list_empty(&rq->queuelist) && !list_empty(&next->queuelist) && + next->fifo_time < rq->fifo_time) { + list_del_init(&rq->queuelist); + list_replace_init(&next->queuelist, &rq->queuelist); + rq->fifo_time = next->fifo_time; + } + + if (bfqq->next_rq == next) + bfqq->next_rq = rq; + + bfq_remove_request(next); + bfqg_stats_update_io_merged(bfqq_group(bfqq), next->cmd_flags); +} + +/* Must be called with bfqq != NULL */ +static void bfq_bfqq_end_wr(struct bfq_queue *bfqq) +{ + BUG_ON(!bfqq); + + if (bfq_bfqq_busy(bfqq)) { + bfqq->bfqd->wr_busy_queues--; + BUG_ON(bfqq->bfqd->wr_busy_queues < 0); + } + bfqq->wr_coeff = 1; + bfqq->wr_cur_max_time = 0; + bfqq->last_wr_start_finish = jiffies; + /* + * Trigger a weight change on the next invocation of + * __bfq_entity_update_weight_prio. + */ + bfqq->entity.prio_changed = 1; + bfq_log_bfqq(bfqq->bfqd, bfqq, + "end_wr: wrais ending at %lu, rais_max_time %u", + bfqq->last_wr_start_finish, + jiffies_to_msecs(bfqq->wr_cur_max_time)); + bfq_log_bfqq(bfqq->bfqd, bfqq, "end_wr: wr_busy %d", + bfqq->bfqd->wr_busy_queues); +} + +static void bfq_end_wr_async_queues(struct bfq_data *bfqd, + struct bfq_group *bfqg) +{ + int i, j; + + for (i = 0; i < 2; i++) + for (j = 0; j < IOPRIO_BE_NR; j++) + if (bfqg->async_bfqq[i][j]) + bfq_bfqq_end_wr(bfqg->async_bfqq[i][j]); + if (bfqg->async_idle_bfqq) + bfq_bfqq_end_wr(bfqg->async_idle_bfqq); +} + +static void bfq_end_wr(struct bfq_data *bfqd) +{ + struct bfq_queue *bfqq; + + spin_lock_irq(bfqd->queue->queue_lock); + + list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) + bfq_bfqq_end_wr(bfqq); + list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list) + bfq_bfqq_end_wr(bfqq); + bfq_end_wr_async(bfqd); + + spin_unlock_irq(bfqd->queue->queue_lock); +} + +static sector_t bfq_io_struct_pos(void *io_struct, bool request) +{ + if (request) + return blk_rq_pos(io_struct); + else + return ((struct bio *)io_struct)->bi_iter.bi_sector; +} + +static int bfq_rq_close_to_sector(void *io_struct, bool request, + sector_t sector) +{ + return abs(bfq_io_struct_pos(io_struct, request) - sector) <= + BFQQ_CLOSE_THR; +} + +static struct bfq_queue *bfqq_find_close(struct bfq_data *bfqd, + struct bfq_queue *bfqq, + sector_t sector) +{ + struct rb_root *root = &bfq_bfqq_to_bfqg(bfqq)->rq_pos_tree; + struct rb_node *parent, *node; + struct bfq_queue *__bfqq; + + if (RB_EMPTY_ROOT(root)) + return NULL; + + /* + * First, if we find a request starting at the end of the last + * request, choose it. + */ + __bfqq = bfq_rq_pos_tree_lookup(bfqd, root, sector, &parent, NULL); + if (__bfqq) + return __bfqq; + + /* + * If the exact sector wasn't found, the parent of the NULL leaf + * will contain the closest sector (rq_pos_tree sorted by + * next_request position). + */ + __bfqq = rb_entry(parent, struct bfq_queue, pos_node); + if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector)) + return __bfqq; + + if (blk_rq_pos(__bfqq->next_rq) < sector) + node = rb_next(&__bfqq->pos_node); + else + node = rb_prev(&__bfqq->pos_node); + if (!node) + return NULL; + + __bfqq = rb_entry(node, struct bfq_queue, pos_node); + if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector)) + return __bfqq; + + return NULL; +} + +static struct bfq_queue *bfq_find_close_cooperator(struct bfq_data *bfqd, + struct bfq_queue *cur_bfqq, + sector_t sector) +{ + struct bfq_queue *bfqq; + + /* + * We shall notice if some of the queues are cooperating, + * e.g., working closely on the same area of the device. In + * that case, we can group them together and: 1) don't waste + * time idling, and 2) serve the union of their requests in + * the best possible order for throughput. + */ + bfqq = bfqq_find_close(bfqd, cur_bfqq, sector); + if (!bfqq || bfqq == cur_bfqq) + return NULL; + + return bfqq; +} + +static struct bfq_queue * +bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) +{ + int process_refs, new_process_refs; + struct bfq_queue *__bfqq; + + /* + * If there are no process references on the new_bfqq, then it is + * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain + * may have dropped their last reference (not just their last process + * reference). + */ + if (!bfqq_process_refs(new_bfqq)) + return NULL; + + /* Avoid a circular list and skip interim queue merges. */ + while ((__bfqq = new_bfqq->new_bfqq)) { + if (__bfqq == bfqq) + return NULL; + new_bfqq = __bfqq; + } + + process_refs = bfqq_process_refs(bfqq); + new_process_refs = bfqq_process_refs(new_bfqq); + /* + * If the process for the bfqq has gone away, there is no + * sense in merging the queues. + */ + if (process_refs == 0 || new_process_refs == 0) + return NULL; + + bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d", + new_bfqq->pid); + + /* + * Merging is just a redirection: the requests of the process + * owning one of the two queues are redirected to the other queue. + * The latter queue, in its turn, is set as shared if this is the + * first time that the requests of some process are redirected to + * it. + * + * We redirect bfqq to new_bfqq and not the opposite, because we + * are in the context of the process owning bfqq, hence we have + * the io_cq of this process. So we can immediately configure this + * io_cq to redirect the requests of the process to new_bfqq. + * + * NOTE, even if new_bfqq coincides with the in-service queue, the + * io_cq of new_bfqq is not available, because, if the in-service + * queue is shared, bfqd->in_service_bic may not point to the + * io_cq of the in-service queue. + * Redirecting the requests of the process owning bfqq to the + * currently in-service queue is in any case the best option, as + * we feed the in-service queue with new requests close to the + * last request served and, by doing so, hopefully increase the + * throughput. + */ + bfqq->new_bfqq = new_bfqq; + new_bfqq->ref += process_refs; + return new_bfqq; +} + +static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq, + struct bfq_queue *new_bfqq) +{ + if (bfq_class_idle(bfqq) || bfq_class_idle(new_bfqq) || + (bfqq->ioprio_class != new_bfqq->ioprio_class)) + return false; + + /* + * If either of the queues has already been detected as seeky, + * then merging it with the other queue is unlikely to lead to + * sequential I/O. + */ + if (BFQQ_SEEKY(bfqq) || BFQQ_SEEKY(new_bfqq)) + return false; + + /* + * Interleaved I/O is known to be done by (some) applications + * only for reads, so it does not make sense to merge async + * queues. + */ + if (!bfq_bfqq_sync(bfqq) || !bfq_bfqq_sync(new_bfqq)) + return false; + + return true; +} + +/* + * If this function returns true, then bfqq cannot be merged. The idea + * is that true cooperation happens very early after processes start + * to do I/O. Usually, late cooperations are just accidental false + * positives. In case bfqq is weight-raised, such false positives + * would evidently degrade latency guarantees for bfqq. + */ +static bool wr_from_too_long(struct bfq_queue *bfqq) +{ + return bfqq->wr_coeff > 1 && + time_is_before_jiffies(bfqq->last_wr_start_finish + + msecs_to_jiffies(100)); +} + +/* + * Attempt to schedule a merge of bfqq with the currently in-service + * queue or with a close queue among the scheduled queues. Return + * NULL if no merge was scheduled, a pointer to the shared bfq_queue + * structure otherwise. + * + * The OOM queue is not allowed to participate to cooperation: in fact, since + * the requests temporarily redirected to the OOM queue could be redirected + * again to dedicated queues at any time, the state needed to correctly + * handle merging with the OOM queue would be quite complex and expensive + * to maintain. Besides, in such a critical condition as an out of memory, + * the benefits of queue merging may be little relevant, or even negligible. + * + * Weight-raised queues can be merged only if their weight-raising + * period has just started. In fact cooperating processes are usually + * started together. Thus, with this filter we avoid false positives + * that would jeopardize low-latency guarantees. + * + * WARNING: queue merging may impair fairness among non-weight raised + * queues, for at least two reasons: 1) the original weight of a + * merged queue may change during the merged state, 2) even being the + * weight the same, a merged queue may be bloated with many more + * requests than the ones produced by its originally-associated + * process. + */ +static struct bfq_queue * +bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, + void *io_struct, bool request) +{ + struct bfq_queue *in_service_bfqq, *new_bfqq; + + if (bfqq->new_bfqq) + return bfqq->new_bfqq; + + if (io_struct && wr_from_too_long(bfqq) && + likely(bfqq != &bfqd->oom_bfqq)) + bfq_log_bfqq(bfqd, bfqq, + "would have looked for coop, but bfq%d wr", + bfqq->pid); + + if (!io_struct || + wr_from_too_long(bfqq) || + unlikely(bfqq == &bfqd->oom_bfqq)) + return NULL; + + /* If there is only one backlogged queue, don't search. */ + if (bfqd->busy_queues == 1) + return NULL; + + in_service_bfqq = bfqd->in_service_queue; + + if (in_service_bfqq && in_service_bfqq != bfqq && + bfqd->in_service_bic && wr_from_too_long(in_service_bfqq) + && likely(in_service_bfqq == &bfqd->oom_bfqq)) + bfq_log_bfqq(bfqd, bfqq, + "would have tried merge with in-service-queue, but wr"); + + if (!in_service_bfqq || in_service_bfqq == bfqq || + !bfqd->in_service_bic || wr_from_too_long(in_service_bfqq) || + unlikely(in_service_bfqq == &bfqd->oom_bfqq)) + goto check_scheduled; + + if (bfq_rq_close_to_sector(io_struct, request, bfqd->last_position) && + bfqq->entity.parent == in_service_bfqq->entity.parent && + bfq_may_be_close_cooperator(bfqq, in_service_bfqq)) { + new_bfqq = bfq_setup_merge(bfqq, in_service_bfqq); + if (new_bfqq) + return new_bfqq; + } + /* + * Check whether there is a cooperator among currently scheduled + * queues. The only thing we need is that the bio/request is not + * NULL, as we need it to establish whether a cooperator exists. + */ +check_scheduled: + new_bfqq = bfq_find_close_cooperator(bfqd, bfqq, + bfq_io_struct_pos(io_struct, request)); + + BUG_ON(new_bfqq && bfqq->entity.parent != new_bfqq->entity.parent); + + if (new_bfqq && wr_from_too_long(new_bfqq) && + likely(new_bfqq != &bfqd->oom_bfqq) && + bfq_may_be_close_cooperator(bfqq, new_bfqq)) + bfq_log_bfqq(bfqd, bfqq, + "would have merged with bfq%d, but wr", + new_bfqq->pid); + + if (new_bfqq && !wr_from_too_long(new_bfqq) && + likely(new_bfqq != &bfqd->oom_bfqq) && + bfq_may_be_close_cooperator(bfqq, new_bfqq)) + return bfq_setup_merge(bfqq, new_bfqq); + + return NULL; +} + +static void bfq_bfqq_save_state(struct bfq_queue *bfqq) +{ + struct bfq_io_cq *bic = bfqq->bic; + + /* + * If !bfqq->bic, the queue is already shared or its requests + * have already been redirected to a shared queue; both idle window + * and weight raising state have already been saved. Do nothing. + */ + if (!bic) + return; + + bic->saved_idle_window = bfq_bfqq_idle_window(bfqq); + bic->saved_IO_bound = bfq_bfqq_IO_bound(bfqq); + bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq); + bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node); + bic->saved_wr_coeff = bfqq->wr_coeff; + bic->saved_wr_start_at_switch_to_srt = bfqq->wr_start_at_switch_to_srt; + bic->saved_last_wr_start_finish = bfqq->last_wr_start_finish; + bic->saved_wr_cur_max_time = bfqq->wr_cur_max_time; + BUG_ON(time_is_after_jiffies(bfqq->last_wr_start_finish)); +} + +static void bfq_get_bic_reference(struct bfq_queue *bfqq) +{ + /* + * If bfqq->bic has a non-NULL value, the bic to which it belongs + * is about to begin using a shared bfq_queue. + */ + if (bfqq->bic) + atomic_long_inc(&bfqq->bic->icq.ioc->refcount); +} + +static void +bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic, + struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) +{ + bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu", + (unsigned long) new_bfqq->pid); + /* Save weight raising and idle window of the merged queues */ + bfq_bfqq_save_state(bfqq); + bfq_bfqq_save_state(new_bfqq); + if (bfq_bfqq_IO_bound(bfqq)) + bfq_mark_bfqq_IO_bound(new_bfqq); + bfq_clear_bfqq_IO_bound(bfqq); + + /* + * If bfqq is weight-raised, then let new_bfqq inherit + * weight-raising. To reduce false positives, neglect the case + * where bfqq has just been created, but has not yet made it + * to be weight-raised (which may happen because EQM may merge + * bfqq even before bfq_add_request is executed for the first + * time for bfqq). Handling this case would however be very + * easy, thanks to the flag just_created. + */ + if (new_bfqq->wr_coeff == 1 && bfqq->wr_coeff > 1) { + new_bfqq->wr_coeff = bfqq->wr_coeff; + new_bfqq->wr_cur_max_time = bfqq->wr_cur_max_time; + new_bfqq->last_wr_start_finish = bfqq->last_wr_start_finish; + new_bfqq->wr_start_at_switch_to_srt = + bfqq->wr_start_at_switch_to_srt; + if (bfq_bfqq_busy(new_bfqq)) { + bfqd->wr_busy_queues++; + BUG_ON(bfqd->wr_busy_queues > bfqd->busy_queues); + } + + new_bfqq->entity.prio_changed = 1; + bfq_log_bfqq(bfqd, new_bfqq, + "wr start after merge with %d, rais_max_time %u", + bfqq->pid, + jiffies_to_msecs(bfqq->wr_cur_max_time)); + } + + if (bfqq->wr_coeff > 1) { /* bfqq has given its wr to new_bfqq */ + bfqq->wr_coeff = 1; + bfqq->entity.prio_changed = 1; + if (bfq_bfqq_busy(bfqq)) { + bfqd->wr_busy_queues--; + BUG_ON(bfqd->wr_busy_queues < 0); + } + + } + + bfq_log_bfqq(bfqd, new_bfqq, "merge_bfqqs: wr_busy %d", + bfqd->wr_busy_queues); + + /* + * Grab a reference to the bic, to prevent it from being destroyed + * before being possibly touched by a bfq_split_bfqq(). + */ + bfq_get_bic_reference(bfqq); + bfq_get_bic_reference(new_bfqq); + /* + * Merge queues (that is, let bic redirect its requests to new_bfqq) + */ + bic_set_bfqq(bic, new_bfqq, 1); + bfq_mark_bfqq_coop(new_bfqq); + /* + * new_bfqq now belongs to at least two bics (it is a shared queue): + * set new_bfqq->bic to NULL. bfqq either: + * - does not belong to any bic any more, and hence bfqq->bic must + * be set to NULL, or + * - is a queue whose owning bics have already been redirected to a + * different queue, hence the queue is destined to not belong to + * any bic soon and bfqq->bic is already NULL (therefore the next + * assignment causes no harm). + */ + new_bfqq->bic = NULL; + bfqq->bic = NULL; + /* release process reference to bfqq */ + bfq_put_queue(bfqq); +} + +static int bfq_allow_bio_merge(struct request_queue *q, struct request *rq, + struct bio *bio) +{ + struct bfq_data *bfqd = q->elevator->elevator_data; + bool is_sync = op_is_sync(bio->bi_opf); + struct bfq_io_cq *bic; + struct bfq_queue *bfqq, *new_bfqq; + + /* + * Disallow merge of a sync bio into an async request. + */ + if (is_sync && !rq_is_sync(rq)) + return false; + + /* + * Lookup the bfqq that this bio will be queued with. Allow + * merge only if rq is queued there. + * Queue lock is held here. + */ + bic = bfq_bic_lookup(bfqd, current->io_context); + if (!bic) + return false; + + bfqq = bic_to_bfqq(bic, is_sync); + /* + * We take advantage of this function to perform an early merge + * of the queues of possible cooperating processes. + */ + if (bfqq) { + new_bfqq = bfq_setup_cooperator(bfqd, bfqq, bio, false); + if (new_bfqq) { + bfq_merge_bfqqs(bfqd, bic, bfqq, new_bfqq); + /* + * If we get here, the bio will be queued in the + * shared queue, i.e., new_bfqq, so use new_bfqq + * to decide whether bio and rq can be merged. + */ + bfqq = new_bfqq; + } + } + + return bfqq == RQ_BFQQ(rq); +} + +static int bfq_allow_rq_merge(struct request_queue *q, struct request *rq, + struct request *next) +{ + return RQ_BFQQ(rq) == RQ_BFQQ(next); +} + +/* + * Set the maximum time for the in-service queue to consume its + * budget. This prevents seeky processes from lowering the throughput. + * In practice, a time-slice service scheme is used with seeky + * processes. + */ +static void bfq_set_budget_timeout(struct bfq_data *bfqd, + struct bfq_queue *bfqq) +{ + unsigned int timeout_coeff; + + if (bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time) + timeout_coeff = 1; + else + timeout_coeff = bfqq->entity.weight / bfqq->entity.orig_weight; + + bfqd->last_budget_start = ktime_get(); + + bfqq->budget_timeout = jiffies + + bfqd->bfq_timeout * timeout_coeff; + + bfq_log_bfqq(bfqd, bfqq, "set budget_timeout %u", + jiffies_to_msecs(bfqd->bfq_timeout * timeout_coeff)); +} + +static void __bfq_set_in_service_queue(struct bfq_data *bfqd, + struct bfq_queue *bfqq) +{ + if (bfqq) { + bfqg_stats_update_avg_queue_size(bfqq_group(bfqq)); + bfq_mark_bfqq_must_alloc(bfqq); + bfq_clear_bfqq_fifo_expire(bfqq); + + bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8; + + BUG_ON(bfqq == bfqd->in_service_queue); + BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list)); + + if (time_is_before_jiffies(bfqq->last_wr_start_finish) && + bfqq->wr_coeff > 1 && + bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time && + time_is_before_jiffies(bfqq->budget_timeout)) { + /* + * For soft real-time queues, move the start + * of the weight-raising period forward by the + * time the queue has not received any + * service. Otherwise, a relatively long + * service delay is likely to cause the + * weight-raising period of the queue to end, + * because of the short duration of the + * weight-raising period of a soft real-time + * queue. It is worth noting that this move + * is not so dangerous for the other queues, + * because soft real-time queues are not + * greedy. + * + * To not add a further variable, we use the + * overloaded field budget_timeout to + * determine for how long the queue has not + * received service, i.e., how much time has + * elapsed since the queue expired. However, + * this is a little imprecise, because + * budget_timeout is set to jiffies if bfqq + * not only expires, but also remains with no + * request. + */ + if (time_after(bfqq->budget_timeout, + bfqq->last_wr_start_finish)) + bfqq->last_wr_start_finish += + jiffies - bfqq->budget_timeout; + else + bfqq->last_wr_start_finish = jiffies; + + if (time_is_after_jiffies(bfqq->last_wr_start_finish)) { + pr_crit( + "BFQ WARNING:last %lu budget %lu jiffies %lu", + bfqq->last_wr_start_finish, + bfqq->budget_timeout, + jiffies); + pr_crit("diff %lu", jiffies - + max_t(unsigned long, + bfqq->last_wr_start_finish, + bfqq->budget_timeout)); + bfqq->last_wr_start_finish = jiffies; + } + } + + bfq_set_budget_timeout(bfqd, bfqq); + bfq_log_bfqq(bfqd, bfqq, + "set_in_service_queue, cur-budget = %d", + bfqq->entity.budget); + } else + bfq_log(bfqd, "set_in_service_queue: NULL"); + + bfqd->in_service_queue = bfqq; +} + +/* + * Get and set a new queue for service. + */ +static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd) +{ + struct bfq_queue *bfqq = bfq_get_next_queue(bfqd); + + __bfq_set_in_service_queue(bfqd, bfqq); + return bfqq; +} + +static void bfq_arm_slice_timer(struct bfq_data *bfqd) +{ + struct bfq_queue *bfqq = bfqd->in_service_queue; + struct bfq_io_cq *bic; + u32 sl; + + BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list)); + + /* Processes have exited, don't wait. */ + bic = bfqd->in_service_bic; + if (!bic || atomic_read(&bic->icq.ioc->active_ref) == 0) + return; + + bfq_mark_bfqq_wait_request(bfqq); + + /* + * We don't want to idle for seeks, but we do want to allow + * fair distribution of slice time for a process doing back-to-back + * seeks. So allow a little bit of time for him to submit a new rq. + * + * To prevent processes with (partly) seeky workloads from + * being too ill-treated, grant them a small fraction of the + * assigned budget before reducing the waiting time to + * BFQ_MIN_TT. This happened to help reduce latency. + */ + sl = bfqd->bfq_slice_idle; + /* + * Unless the queue is being weight-raised or the scenario is + * asymmetric, grant only minimum idle time if the queue + * is seeky. A long idling is preserved for a weight-raised + * queue, or, more in general, in an asymemtric scenario, + * because a long idling is needed for guaranteeing to a queue + * its reserved share of the throughput (in particular, it is + * needed if the queue has a higher weight than some other + * queue). + */ + if (BFQQ_SEEKY(bfqq) && bfqq->wr_coeff == 1 && + bfq_symmetric_scenario(bfqd)) + sl = min_t(u32, sl, BFQ_MIN_TT); + + bfqd->last_idling_start = ktime_get(); + hrtimer_start(&bfqd->idle_slice_timer, ns_to_ktime(sl), + HRTIMER_MODE_REL); + bfqg_stats_set_start_idle_time(bfqq_group(bfqq)); + bfq_log(bfqd, "arm idle: %ld/%ld ms", + sl / NSEC_PER_MSEC, bfqd->bfq_slice_idle / NSEC_PER_MSEC); +} + +/* + * In autotuning mode, max_budget is dynamically recomputed as the + * amount of sectors transferred in timeout at the estimated peak + * rate. This enables BFQ to utilize a full timeslice with a full + * budget, even if the in-service queue is served at peak rate. And + * this maximises throughput with sequential workloads. + */ +static unsigned long bfq_calc_max_budget(struct bfq_data *bfqd) +{ + return (u64)bfqd->peak_rate * USEC_PER_MSEC * + jiffies_to_msecs(bfqd->bfq_timeout)>>BFQ_RATE_SHIFT; +} + +/* + * Update parameters related to throughput and responsiveness, as a + * function of the estimated peak rate. See comments on + * bfq_calc_max_budget(), and on T_slow and T_fast arrays. + */ +static void update_thr_responsiveness_params(struct bfq_data *bfqd) +{ + int dev_type = blk_queue_nonrot(bfqd->queue); + + if (bfqd->bfq_user_max_budget == 0) { + bfqd->bfq_max_budget = + bfq_calc_max_budget(bfqd); + BUG_ON(bfqd->bfq_max_budget < 0); + bfq_log(bfqd, "new max_budget = %d", + bfqd->bfq_max_budget); + } + + if (bfqd->device_speed == BFQ_BFQD_FAST && + bfqd->peak_rate < device_speed_thresh[dev_type]) { + bfqd->device_speed = BFQ_BFQD_SLOW; + bfqd->RT_prod = R_slow[dev_type] * + T_slow[dev_type]; + } else if (bfqd->device_speed == BFQ_BFQD_SLOW && + bfqd->peak_rate > device_speed_thresh[dev_type]) { + bfqd->device_speed = BFQ_BFQD_FAST; + bfqd->RT_prod = R_fast[dev_type] * + T_fast[dev_type]; + } + + bfq_log(bfqd, +"dev_type %s dev_speed_class = %s (%llu sects/sec), thresh %llu setcs/sec", + dev_type == 0 ? "ROT" : "NONROT", + bfqd->device_speed == BFQ_BFQD_FAST ? "FAST" : "SLOW", + bfqd->device_speed == BFQ_BFQD_FAST ? + (USEC_PER_SEC*(u64)R_fast[dev_type])>>BFQ_RATE_SHIFT : + (USEC_PER_SEC*(u64)R_slow[dev_type])>>BFQ_RATE_SHIFT, + (USEC_PER_SEC*(u64)device_speed_thresh[dev_type])>> + BFQ_RATE_SHIFT); +} + +static void bfq_reset_rate_computation(struct bfq_data *bfqd, struct request *rq) +{ + if (rq != NULL) { /* new rq dispatch now, reset accordingly */ + bfqd->last_dispatch = bfqd->first_dispatch = ktime_get_ns() ; + bfqd->peak_rate_samples = 1; + bfqd->sequential_samples = 0; + bfqd->tot_sectors_dispatched = bfqd->last_rq_max_size = + blk_rq_sectors(rq); + } else /* no new rq dispatched, just reset the number of samples */ + bfqd->peak_rate_samples = 0; /* full re-init on next disp. */ + + bfq_log(bfqd, + "reset_rate_computation at end, sample %u/%u tot_sects %llu", + bfqd->peak_rate_samples, bfqd->sequential_samples, + bfqd->tot_sectors_dispatched); +} + +static void bfq_update_rate_reset(struct bfq_data *bfqd, struct request *rq) +{ + u32 rate, weight, divisor; + + /* + * For the convergence property to hold (see comments on + * bfq_update_peak_rate()) and for the assessment to be + * reliable, a minimum number of samples must be present, and + * a minimum amount of time must have elapsed. If not so, do + * not compute new rate. Just reset parameters, to get ready + * for a new evaluation attempt. + */ + if (bfqd->peak_rate_samples < BFQ_RATE_MIN_SAMPLES || + bfqd->delta_from_first < BFQ_RATE_MIN_INTERVAL) { + bfq_log(bfqd, + "update_rate_reset: only resetting, delta_first %lluus samples %d", + bfqd->delta_from_first>>10, bfqd->peak_rate_samples); + goto reset_computation; + } + + /* + * If a new request completion has occurred after last + * dispatch, then, to approximate the rate at which requests + * have been served by the device, it is more precise to + * extend the observation interval to the last completion. + */ + bfqd->delta_from_first = + max_t(u64, bfqd->delta_from_first, + bfqd->last_completion - bfqd->first_dispatch); + + BUG_ON(bfqd->delta_from_first == 0); + /* + * Rate computed in sects/usec, and not sects/nsec, for + * precision issues. + */ + rate = div64_ul(bfqd->tot_sectors_dispatched<delta_from_first, NSEC_PER_USEC)); + + bfq_log(bfqd, +"update_rate_reset: tot_sects %llu delta_first %lluus rate %llu sects/s (%d)", + bfqd->tot_sectors_dispatched, bfqd->delta_from_first>>10, + ((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT), + rate > 20< 20M sectors/sec) + */ + if ((bfqd->sequential_samples < (3 * bfqd->peak_rate_samples)>>2 && + rate <= bfqd->peak_rate) || + rate > 20<peak_rate_samples, bfqd->sequential_samples, + ((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT), + ((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT)); + goto reset_computation; + } else { + bfq_log(bfqd, + "update_rate_reset: do update, samples %u/%u rate/peak %llu/%llu", + bfqd->peak_rate_samples, bfqd->sequential_samples, + ((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT), + ((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT)); + } + + /* + * We have to update the peak rate, at last! To this purpose, + * we use a low-pass filter. We compute the smoothing constant + * of the filter as a function of the 'weight' of the new + * measured rate. + * + * As can be seen in next formulas, we define this weight as a + * quantity proportional to how sequential the workload is, + * and to how long the observation time interval is. + * + * The weight runs from 0 to 8. The maximum value of the + * weight, 8, yields the minimum value for the smoothing + * constant. At this minimum value for the smoothing constant, + * the measured rate contributes for half of the next value of + * the estimated peak rate. + * + * So, the first step is to compute the weight as a function + * of how sequential the workload is. Note that the weight + * cannot reach 9, because bfqd->sequential_samples cannot + * become equal to bfqd->peak_rate_samples, which, in its + * turn, holds true because bfqd->sequential_samples is not + * incremented for the first sample. + */ + weight = (9 * bfqd->sequential_samples) / bfqd->peak_rate_samples; + + /* + * Second step: further refine the weight as a function of the + * duration of the observation interval. + */ + weight = min_t(u32, 8, + div_u64(weight * bfqd->delta_from_first, + BFQ_RATE_REF_INTERVAL)); + + /* + * Divisor ranging from 10, for minimum weight, to 2, for + * maximum weight. + */ + divisor = 10 - weight; + BUG_ON(divisor == 0); + + /* + * Finally, update peak rate: + * + * peak_rate = peak_rate * (divisor-1) / divisor + rate / divisor + */ + bfqd->peak_rate *= divisor-1; + bfqd->peak_rate /= divisor; + rate /= divisor; /* smoothing constant alpha = 1/divisor */ + + bfq_log(bfqd, + "update_rate_reset: divisor %d tmp_peak_rate %llu tmp_rate %u", + divisor, + ((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT), + (u32)((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT)); + + BUG_ON(bfqd->peak_rate == 0); + BUG_ON(bfqd->peak_rate > 20<peak_rate += rate; + update_thr_responsiveness_params(bfqd); + BUG_ON(bfqd->peak_rate > 20<peak_rate_samples == 0) { /* first dispatch */ + bfq_log(bfqd, + "update_peak_rate: goto reset, samples %d", + bfqd->peak_rate_samples) ; + bfq_reset_rate_computation(bfqd, rq); + goto update_last_values; /* will add one sample */ + } + + /* + * Device idle for very long: the observation interval lasting + * up to this dispatch cannot be a valid observation interval + * for computing a new peak rate (similarly to the late- + * completion event in bfq_completed_request()). Go to + * update_rate_and_reset to have the following three steps + * taken: + * - close the observation interval at the last (previous) + * request dispatch or completion + * - compute rate, if possible, for that observation interval + * - start a new observation interval with this dispatch + */ + if (now_ns - bfqd->last_dispatch > 100*NSEC_PER_MSEC && + bfqd->rq_in_driver == 0) { + bfq_log(bfqd, +"update_peak_rate: jumping to updating&resetting delta_last %lluus samples %d", + (now_ns - bfqd->last_dispatch)>>10, + bfqd->peak_rate_samples) ; + goto update_rate_and_reset; + } + + /* Update sampling information */ + bfqd->peak_rate_samples++; + + if ((bfqd->rq_in_driver > 0 || + now_ns - bfqd->last_completion < BFQ_MIN_TT) + && get_sdist(bfqd->last_position, rq) < BFQQ_SEEK_THR) + bfqd->sequential_samples++; + + bfqd->tot_sectors_dispatched += blk_rq_sectors(rq); + + /* Reset max observed rq size every 32 dispatches */ + if (likely(bfqd->peak_rate_samples % 32)) + bfqd->last_rq_max_size = + max_t(u32, blk_rq_sectors(rq), bfqd->last_rq_max_size); + else + bfqd->last_rq_max_size = blk_rq_sectors(rq); + + bfqd->delta_from_first = now_ns - bfqd->first_dispatch; + + bfq_log(bfqd, + "update_peak_rate: added samples %u/%u tot_sects %llu delta_first %lluus", + bfqd->peak_rate_samples, bfqd->sequential_samples, + bfqd->tot_sectors_dispatched, + bfqd->delta_from_first>>10); + + /* Target observation interval not yet reached, go on sampling */ + if (bfqd->delta_from_first < BFQ_RATE_REF_INTERVAL) + goto update_last_values; + +update_rate_and_reset: + bfq_update_rate_reset(bfqd, rq); +update_last_values: + bfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq); + bfqd->last_dispatch = now_ns; + + bfq_log(bfqd, + "update_peak_rate: delta_first %lluus last_pos %llu peak_rate %llu", + (now_ns - bfqd->first_dispatch)>>10, + (unsigned long long) bfqd->last_position, + ((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT)); + bfq_log(bfqd, + "update_peak_rate: samples at end %d", bfqd->peak_rate_samples); +} + +/* + * Move request from internal lists to the dispatch list of the request queue + */ +static void bfq_dispatch_insert(struct request_queue *q, struct request *rq) +{ + struct bfq_queue *bfqq = RQ_BFQQ(rq); + + /* + * For consistency, the next instruction should have been executed + * after removing the request from the queue and dispatching it. + * We execute instead this instruction before bfq_remove_request() + * (and hence introduce a temporary inconsistency), for efficiency. + * In fact, in a forced_dispatch, this prevents two counters related + * to bfqq->dispatched to risk to be uselessly decremented if bfqq + * is not in service, and then to be incremented again after + * incrementing bfqq->dispatched. + */ + bfqq->dispatched++; + bfq_update_peak_rate(q->elevator->elevator_data, rq); + + bfq_remove_request(rq); + elv_dispatch_sort(q, rq); +} + +static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq) +{ + BUG_ON(bfqq != bfqd->in_service_queue); + + /* + * If this bfqq is shared between multiple processes, check + * to make sure that those processes are still issuing I/Os + * within the mean seek distance. If not, it may be time to + * break the queues apart again. + */ + if (bfq_bfqq_coop(bfqq) && BFQQ_SEEKY(bfqq)) + bfq_mark_bfqq_split_coop(bfqq); + + if (RB_EMPTY_ROOT(&bfqq->sort_list)) { + if (bfqq->dispatched == 0) + /* + * Overloading budget_timeout field to store + * the time at which the queue remains with no + * backlog and no outstanding request; used by + * the weight-raising mechanism. + */ + bfqq->budget_timeout = jiffies; + + bfq_del_bfqq_busy(bfqd, bfqq, true); + } else { + bfq_requeue_bfqq(bfqd, bfqq); + /* + * Resort priority tree of potential close cooperators. + */ + bfq_pos_tree_add_move(bfqd, bfqq); + } + + /* + * All in-service entities must have been properly deactivated + * or requeued before executing the next function, which + * resets all in-service entites as no more in service. + */ + __bfq_bfqd_reset_in_service(bfqd); +} + +/** + * __bfq_bfqq_recalc_budget - try to adapt the budget to the @bfqq behavior. + * @bfqd: device data. + * @bfqq: queue to update. + * @reason: reason for expiration. + * + * Handle the feedback on @bfqq budget at queue expiration. + * See the body for detailed comments. + */ +static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd, + struct bfq_queue *bfqq, + enum bfqq_expiration reason) +{ + struct request *next_rq; + int budget, min_budget; + + BUG_ON(bfqq != bfqd->in_service_queue); + + min_budget = bfq_min_budget(bfqd); + + if (bfqq->wr_coeff == 1) + budget = bfqq->max_budget; + else /* + * Use a constant, low budget for weight-raised queues, + * to help achieve a low latency. Keep it slightly higher + * than the minimum possible budget, to cause a little + * bit fewer expirations. + */ + budget = 2 * min_budget; + + bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last budg %d, budg left %d", + bfqq->entity.budget, bfq_bfqq_budget_left(bfqq)); + bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last max_budg %d, min budg %d", + budget, bfq_min_budget(bfqd)); + bfq_log_bfqq(bfqd, bfqq, "recalc_budg: sync %d, seeky %d", + bfq_bfqq_sync(bfqq), BFQQ_SEEKY(bfqd->in_service_queue)); + + if (bfq_bfqq_sync(bfqq) && bfqq->wr_coeff == 1) { + switch (reason) { + /* + * Caveat: in all the following cases we trade latency + * for throughput. + */ + case BFQ_BFQQ_TOO_IDLE: + /* + * This is the only case where we may reduce + * the budget: if there is no request of the + * process still waiting for completion, then + * we assume (tentatively) that the timer has + * expired because the batch of requests of + * the process could have been served with a + * smaller budget. Hence, betting that + * process will behave in the same way when it + * becomes backlogged again, we reduce its + * next budget. As long as we guess right, + * this budget cut reduces the latency + * experienced by the process. + * + * However, if there are still outstanding + * requests, then the process may have not yet + * issued its next request just because it is + * still waiting for the completion of some of + * the still outstanding ones. So in this + * subcase we do not reduce its budget, on the + * contrary we increase it to possibly boost + * the throughput, as discussed in the + * comments to the BUDGET_TIMEOUT case. + */ + if (bfqq->dispatched > 0) /* still outstanding reqs */ + budget = min(budget * 2, bfqd->bfq_max_budget); + else { + if (budget > 5 * min_budget) + budget -= 4 * min_budget; + else + budget = min_budget; + } + break; + case BFQ_BFQQ_BUDGET_TIMEOUT: + /* + * We double the budget here because it gives + * the chance to boost the throughput if this + * is not a seeky process (and has bumped into + * this timeout because of, e.g., ZBR). + */ + budget = min(budget * 2, bfqd->bfq_max_budget); + break; + case BFQ_BFQQ_BUDGET_EXHAUSTED: + /* + * The process still has backlog, and did not + * let either the budget timeout or the disk + * idling timeout expire. Hence it is not + * seeky, has a short thinktime and may be + * happy with a higher budget too. So + * definitely increase the budget of this good + * candidate to boost the disk throughput. + */ + budget = min(budget * 4, bfqd->bfq_max_budget); + break; + case BFQ_BFQQ_NO_MORE_REQUESTS: + /* + * For queues that expire for this reason, it + * is particularly important to keep the + * budget close to the actual service they + * need. Doing so reduces the timestamp + * misalignment problem described in the + * comments in the body of + * __bfq_activate_entity. In fact, suppose + * that a queue systematically expires for + * BFQ_BFQQ_NO_MORE_REQUESTS and presents a + * new request in time to enjoy timestamp + * back-shifting. The larger the budget of the + * queue is with respect to the service the + * queue actually requests in each service + * slot, the more times the queue can be + * reactivated with the same virtual finish + * time. It follows that, even if this finish + * time is pushed to the system virtual time + * to reduce the consequent timestamp + * misalignment, the queue unjustly enjoys for + * many re-activations a lower finish time + * than all newly activated queues. + * + * The service needed by bfqq is measured + * quite precisely by bfqq->entity.service. + * Since bfqq does not enjoy device idling, + * bfqq->entity.service is equal to the number + * of sectors that the process associated with + * bfqq requested to read/write before waiting + * for request completions, or blocking for + * other reasons. + */ + budget = max_t(int, bfqq->entity.service, min_budget); + break; + default: + return; + } + } else if (!bfq_bfqq_sync(bfqq)) + /* + * Async queues get always the maximum possible + * budget, as for them we do not care about latency + * (in addition, their ability to dispatch is limited + * by the charging factor). + */ + budget = bfqd->bfq_max_budget; + + bfqq->max_budget = budget; + + if (bfqd->budgets_assigned >= bfq_stats_min_budgets && + !bfqd->bfq_user_max_budget) + bfqq->max_budget = min(bfqq->max_budget, bfqd->bfq_max_budget); + + /* + * If there is still backlog, then assign a new budget, making + * sure that it is large enough for the next request. Since + * the finish time of bfqq must be kept in sync with the + * budget, be sure to call __bfq_bfqq_expire() *after* this + * update. + * + * If there is no backlog, then no need to update the budget; + * it will be updated on the arrival of a new request. + */ + next_rq = bfqq->next_rq; + if (next_rq) { + BUG_ON(reason == BFQ_BFQQ_TOO_IDLE || + reason == BFQ_BFQQ_NO_MORE_REQUESTS); + bfqq->entity.budget = max_t(unsigned long, bfqq->max_budget, + bfq_serv_to_charge(next_rq, bfqq)); + BUG_ON(!bfq_bfqq_busy(bfqq)); + BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list)); + } + + bfq_log_bfqq(bfqd, bfqq, "head sect: %u, new budget %d", + next_rq ? blk_rq_sectors(next_rq) : 0, + bfqq->entity.budget); +} + +/* + * Return true if the process associated with bfqq is "slow". The slow + * flag is used, in addition to the budget timeout, to reduce the + * amount of service provided to seeky processes, and thus reduce + * their chances to lower the throughput. More details in the comments + * on the function bfq_bfqq_expire(). + * + * An important observation is in order: as discussed in the comments + * on the function bfq_update_peak_rate(), with devices with internal + * queues, it is hard if ever possible to know when and for how long + * an I/O request is processed by the device (apart from the trivial + * I/O pattern where a new request is dispatched only after the + * previous one has been completed). This makes it hard to evaluate + * the real rate at which the I/O requests of each bfq_queue are + * served. In fact, for an I/O scheduler like BFQ, serving a + * bfq_queue means just dispatching its requests during its service + * slot (i.e., until the budget of the queue is exhausted, or the + * queue remains idle, or, finally, a timeout fires). But, during the + * service slot of a bfq_queue, around 100 ms at most, the device may + * be even still processing requests of bfq_queues served in previous + * service slots. On the opposite end, the requests of the in-service + * bfq_queue may be completed after the service slot of the queue + * finishes. + * + * Anyway, unless more sophisticated solutions are used + * (where possible), the sum of the sizes of the requests dispatched + * during the service slot of a bfq_queue is probably the only + * approximation available for the service received by the bfq_queue + * during its service slot. And this sum is the quantity used in this + * function to evaluate the I/O speed of a process. + */ +static bool bfq_bfqq_is_slow(struct bfq_data *bfqd, struct bfq_queue *bfqq, + bool compensate, enum bfqq_expiration reason, + unsigned long *delta_ms) +{ + ktime_t delta_ktime; + u32 delta_usecs; + bool slow = BFQQ_SEEKY(bfqq); /* if delta too short, use seekyness */ + + if (!bfq_bfqq_sync(bfqq)) + return false; + + if (compensate) + delta_ktime = bfqd->last_idling_start; + else + delta_ktime = ktime_get(); + delta_ktime = ktime_sub(delta_ktime, bfqd->last_budget_start); + delta_usecs = ktime_to_us(delta_ktime); + + /* don't use too short time intervals */ + if (delta_usecs < 1000) { + if (blk_queue_nonrot(bfqd->queue)) + /* + * give same worst-case guarantees as idling + * for seeky + */ + *delta_ms = BFQ_MIN_TT / NSEC_PER_MSEC; + else /* charge at least one seek */ + *delta_ms = bfq_slice_idle / NSEC_PER_MSEC; + + bfq_log(bfqd, "bfq_bfqq_is_slow: too short %u", delta_usecs); + + return slow; + } + + *delta_ms = delta_usecs / USEC_PER_MSEC; + + /* + * Use only long (> 20ms) intervals to filter out excessive + * spikes in service rate estimation. + */ + if (delta_usecs > 20000) { + /* + * Caveat for rotational devices: processes doing I/O + * in the slower disk zones tend to be slow(er) even + * if not seeky. In this respect, the estimated peak + * rate is likely to be an average over the disk + * surface. Accordingly, to not be too harsh with + * unlucky processes, a process is deemed slow only if + * its rate has been lower than half of the estimated + * peak rate. + */ + slow = bfqq->entity.service < bfqd->bfq_max_budget / 2; + bfq_log(bfqd, "bfq_bfqq_is_slow: relative rate %d/%d", + bfqq->entity.service, bfqd->bfq_max_budget); + } + + bfq_log_bfqq(bfqd, bfqq, "bfq_bfqq_is_slow: slow %d", slow); + + return slow; +} + +/* + * To be deemed as soft real-time, an application must meet two + * requirements. First, the application must not require an average + * bandwidth higher than the approximate bandwidth required to playback or + * record a compressed high-definition video. + * The next function is invoked on the completion of the last request of a + * batch, to compute the next-start time instant, soft_rt_next_start, such + * that, if the next request of the application does not arrive before + * soft_rt_next_start, then the above requirement on the bandwidth is met. + * + * The second requirement is that the request pattern of the application is + * isochronous, i.e., that, after issuing a request or a batch of requests, + * the application stops issuing new requests until all its pending requests + * have been completed. After that, the application may issue a new batch, + * and so on. + * For this reason the next function is invoked to compute + * soft_rt_next_start only for applications that meet this requirement, + * whereas soft_rt_next_start is set to infinity for applications that do + * not. + * + * Unfortunately, even a greedy application may happen to behave in an + * isochronous way if the CPU load is high. In fact, the application may + * stop issuing requests while the CPUs are busy serving other processes, + * then restart, then stop again for a while, and so on. In addition, if + * the disk achieves a low enough throughput with the request pattern + * issued by the application (e.g., because the request pattern is random + * and/or the device is slow), then the application may meet the above + * bandwidth requirement too. To prevent such a greedy application to be + * deemed as soft real-time, a further rule is used in the computation of + * soft_rt_next_start: soft_rt_next_start must be higher than the current + * time plus the maximum time for which the arrival of a request is waited + * for when a sync queue becomes idle, namely bfqd->bfq_slice_idle. + * This filters out greedy applications, as the latter issue instead their + * next request as soon as possible after the last one has been completed + * (in contrast, when a batch of requests is completed, a soft real-time + * application spends some time processing data). + * + * Unfortunately, the last filter may easily generate false positives if + * only bfqd->bfq_slice_idle is used as a reference time interval and one + * or both the following cases occur: + * 1) HZ is so low that the duration of a jiffy is comparable to or higher + * than bfqd->bfq_slice_idle. This happens, e.g., on slow devices with + * HZ=100. + * 2) jiffies, instead of increasing at a constant rate, may stop increasing + * for a while, then suddenly 'jump' by several units to recover the lost + * increments. This seems to happen, e.g., inside virtual machines. + * To address this issue, we do not use as a reference time interval just + * bfqd->bfq_slice_idle, but bfqd->bfq_slice_idle plus a few jiffies. In + * particular we add the minimum number of jiffies for which the filter + * seems to be quite precise also in embedded systems and KVM/QEMU virtual + * machines. + */ +static unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd, + struct bfq_queue *bfqq) +{ + bfq_log_bfqq(bfqd, bfqq, +"softrt_next_start: service_blkg %lu soft_rate %u sects/sec interval %u", + bfqq->service_from_backlogged, + bfqd->bfq_wr_max_softrt_rate, + jiffies_to_msecs(HZ * bfqq->service_from_backlogged / + bfqd->bfq_wr_max_softrt_rate)); + + return max(bfqq->last_idle_bklogged + + HZ * bfqq->service_from_backlogged / + bfqd->bfq_wr_max_softrt_rate, + jiffies + nsecs_to_jiffies(bfqq->bfqd->bfq_slice_idle) + 4); +} + +/* + * Return the farthest future time instant according to jiffies + * macros. + */ +static unsigned long bfq_greatest_from_now(void) +{ + return jiffies + MAX_JIFFY_OFFSET; +} + +/* + * Return the farthest past time instant according to jiffies + * macros. + */ +static unsigned long bfq_smallest_from_now(void) +{ + return jiffies - MAX_JIFFY_OFFSET; +} + +/** + * bfq_bfqq_expire - expire a queue. + * @bfqd: device owning the queue. + * @bfqq: the queue to expire. + * @compensate: if true, compensate for the time spent idling. + * @reason: the reason causing the expiration. + * + * If the process associated with bfqq does slow I/O (e.g., because it + * issues random requests), we charge bfqq with the time it has been + * in service instead of the service it has received (see + * bfq_bfqq_charge_time for details on how this goal is achieved). As + * a consequence, bfqq will typically get higher timestamps upon + * reactivation, and hence it will be rescheduled as if it had + * received more service than what it has actually received. In the + * end, bfqq receives less service in proportion to how slowly its + * associated process consumes its budgets (and hence how seriously it + * tends to lower the throughput). In addition, this time-charging + * strategy guarantees time fairness among slow processes. In + * contrast, if the process associated with bfqq is not slow, we + * charge bfqq exactly with the service it has received. + * + * Charging time to the first type of queues and the exact service to + * the other has the effect of using the WF2Q+ policy to schedule the + * former on a timeslice basis, without violating service domain + * guarantees among the latter. + */ +static void bfq_bfqq_expire(struct bfq_data *bfqd, + struct bfq_queue *bfqq, + bool compensate, + enum bfqq_expiration reason) +{ + bool slow; + unsigned long delta = 0; + struct bfq_entity *entity = &bfqq->entity; + int ref; + + BUG_ON(bfqq != bfqd->in_service_queue); + + /* + * Check whether the process is slow (see bfq_bfqq_is_slow). + */ + slow = bfq_bfqq_is_slow(bfqd, bfqq, compensate, reason, &delta); + + /* + * Increase service_from_backlogged before next statement, + * because the possible next invocation of + * bfq_bfqq_charge_time would likely inflate + * entity->service. In contrast, service_from_backlogged must + * contain real service, to enable the soft real-time + * heuristic to correctly compute the bandwidth consumed by + * bfqq. + */ + bfqq->service_from_backlogged += entity->service; + + /* + * As above explained, charge slow (typically seeky) and + * timed-out queues with the time and not the service + * received, to favor sequential workloads. + * + * Processes doing I/O in the slower disk zones will tend to + * be slow(er) even if not seeky. Therefore, since the + * estimated peak rate is actually an average over the disk + * surface, these processes may timeout just for bad luck. To + * avoid punishing them, do not charge time to processes that + * succeeded in consuming at least 2/3 of their budget. This + * allows BFQ to preserve enough elasticity to still perform + * bandwidth, and not time, distribution with little unlucky + * or quasi-sequential processes. + */ + if (bfqq->wr_coeff == 1 && + (slow || + (reason == BFQ_BFQQ_BUDGET_TIMEOUT && + bfq_bfqq_budget_left(bfqq) >= entity->budget / 3))) + bfq_bfqq_charge_time(bfqd, bfqq, delta); + + BUG_ON(bfqq->entity.budget < bfqq->entity.service); + + if (reason == BFQ_BFQQ_TOO_IDLE && + entity->service <= 2 * entity->budget / 10) + bfq_clear_bfqq_IO_bound(bfqq); + + if (bfqd->low_latency && bfqq->wr_coeff == 1) + bfqq->last_wr_start_finish = jiffies; + + if (bfqd->low_latency && bfqd->bfq_wr_max_softrt_rate > 0 && + RB_EMPTY_ROOT(&bfqq->sort_list)) { + /* + * If we get here, and there are no outstanding + * requests, then the request pattern is isochronous + * (see the comments on the function + * bfq_bfqq_softrt_next_start()). Thus we can compute + * soft_rt_next_start. If, instead, the queue still + * has outstanding requests, then we have to wait for + * the completion of all the outstanding requests to + * discover whether the request pattern is actually + * isochronous. + */ + BUG_ON(bfqd->busy_queues < 1); + if (bfqq->dispatched == 0) { + bfqq->soft_rt_next_start = + bfq_bfqq_softrt_next_start(bfqd, bfqq); + bfq_log_bfqq(bfqd, bfqq, "new soft_rt_next %lu", + bfqq->soft_rt_next_start); + } else { + /* + * The application is still waiting for the + * completion of one or more requests: + * prevent it from possibly being incorrectly + * deemed as soft real-time by setting its + * soft_rt_next_start to infinity. In fact, + * without this assignment, the application + * would be incorrectly deemed as soft + * real-time if: + * 1) it issued a new request before the + * completion of all its in-flight + * requests, and + * 2) at that time, its soft_rt_next_start + * happened to be in the past. + */ + bfqq->soft_rt_next_start = + bfq_greatest_from_now(); + /* + * Schedule an update of soft_rt_next_start to when + * the task may be discovered to be isochronous. + */ + bfq_mark_bfqq_softrt_update(bfqq); + } + } + + bfq_log_bfqq(bfqd, bfqq, + "expire (%d, slow %d, num_disp %d, idle_win %d, weight %d)", + reason, slow, bfqq->dispatched, + bfq_bfqq_idle_window(bfqq), entity->weight); + + /* + * Increase, decrease or leave budget unchanged according to + * reason. + */ + BUG_ON(bfqq->entity.budget < bfqq->entity.service); + __bfq_bfqq_recalc_budget(bfqd, bfqq, reason); + BUG_ON(bfqq->next_rq == NULL && + bfqq->entity.budget < bfqq->entity.service); + ref = bfqq->ref; + __bfq_bfqq_expire(bfqd, bfqq); + + BUG_ON(ref > 1 && + !bfq_bfqq_busy(bfqq) && reason == BFQ_BFQQ_BUDGET_EXHAUSTED && + !bfq_class_idle(bfqq)); + + /* mark bfqq as waiting a request only if a bic still points to it */ + if (ref > 1 && !bfq_bfqq_busy(bfqq) && + reason != BFQ_BFQQ_BUDGET_TIMEOUT && + reason != BFQ_BFQQ_BUDGET_EXHAUSTED) + bfq_mark_bfqq_non_blocking_wait_rq(bfqq); +} + +/* + * Budget timeout is not implemented through a dedicated timer, but + * just checked on request arrivals and completions, as well as on + * idle timer expirations. + */ +static bool bfq_bfqq_budget_timeout(struct bfq_queue *bfqq) +{ + return time_is_before_eq_jiffies(bfqq->budget_timeout); +} + +/* + * If we expire a queue that is actively waiting (i.e., with the + * device idled) for the arrival of a new request, then we may incur + * the timestamp misalignment problem described in the body of the + * function __bfq_activate_entity. Hence we return true only if this + * condition does not hold, or if the queue is slow enough to deserve + * only to be kicked off for preserving a high throughput. + */ +static bool bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq) +{ + bfq_log_bfqq(bfqq->bfqd, bfqq, + "may_budget_timeout: wait_request %d left %d timeout %d", + bfq_bfqq_wait_request(bfqq), + bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3, + bfq_bfqq_budget_timeout(bfqq)); + + return (!bfq_bfqq_wait_request(bfqq) || + bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3) + && + bfq_bfqq_budget_timeout(bfqq); +} + +/* + * For a queue that becomes empty, device idling is allowed only if + * this function returns true for that queue. As a consequence, since + * device idling plays a critical role for both throughput boosting + * and service guarantees, the return value of this function plays a + * critical role as well. + * + * In a nutshell, this function returns true only if idling is + * beneficial for throughput or, even if detrimental for throughput, + * idling is however necessary to preserve service guarantees (low + * latency, desired throughput distribution, ...). In particular, on + * NCQ-capable devices, this function tries to return false, so as to + * help keep the drives' internal queues full, whenever this helps the + * device boost the throughput without causing any service-guarantee + * issue. + * + * In more detail, the return value of this function is obtained by, + * first, computing a number of boolean variables that take into + * account throughput and service-guarantee issues, and, then, + * combining these variables in a logical expression. Most of the + * issues taken into account are not trivial. We discuss these issues + * while introducing the variables. + */ +static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq) +{ + struct bfq_data *bfqd = bfqq->bfqd; + bool idling_boosts_thr, idling_boosts_thr_without_issues, + idling_needed_for_service_guarantees, + asymmetric_scenario; + + if (bfqd->strict_guarantees) + return true; + + /* + * The next variable takes into account the cases where idling + * boosts the throughput. + * + * The value of the variable is computed considering, first, that + * idling is virtually always beneficial for the throughput if: + * (a) the device is not NCQ-capable, or + * (b) regardless of the presence of NCQ, the device is rotational + * and the request pattern for bfqq is I/O-bound and sequential. + * + * Secondly, and in contrast to the above item (b), idling an + * NCQ-capable flash-based device would not boost the + * throughput even with sequential I/O; rather it would lower + * the throughput in proportion to how fast the device + * is. Accordingly, the next variable is true if any of the + * above conditions (a) and (b) is true, and, in particular, + * happens to be false if bfqd is an NCQ-capable flash-based + * device. + */ + idling_boosts_thr = !bfqd->hw_tag || + (!blk_queue_nonrot(bfqd->queue) && bfq_bfqq_IO_bound(bfqq) && + bfq_bfqq_idle_window(bfqq)); + + /* + * The value of the next variable, + * idling_boosts_thr_without_issues, is equal to that of + * idling_boosts_thr, unless a special case holds. In this + * special case, described below, idling may cause problems to + * weight-raised queues. + * + * When the request pool is saturated (e.g., in the presence + * of write hogs), if the processes associated with + * non-weight-raised queues ask for requests at a lower rate, + * then processes associated with weight-raised queues have a + * higher probability to get a request from the pool + * immediately (or at least soon) when they need one. Thus + * they have a higher probability to actually get a fraction + * of the device throughput proportional to their high + * weight. This is especially true with NCQ-capable drives, + * which enqueue several requests in advance, and further + * reorder internally-queued requests. + * + * For this reason, we force to false the value of + * idling_boosts_thr_without_issues if there are weight-raised + * busy queues. In this case, and if bfqq is not weight-raised, + * this guarantees that the device is not idled for bfqq (if, + * instead, bfqq is weight-raised, then idling will be + * guaranteed by another variable, see below). Combined with + * the timestamping rules of BFQ (see [1] for details), this + * behavior causes bfqq, and hence any sync non-weight-raised + * queue, to get a lower number of requests served, and thus + * to ask for a lower number of requests from the request + * pool, before the busy weight-raised queues get served + * again. This often mitigates starvation problems in the + * presence of heavy write workloads and NCQ, thereby + * guaranteeing a higher application and system responsiveness + * in these hostile scenarios. + */ + idling_boosts_thr_without_issues = idling_boosts_thr && + bfqd->wr_busy_queues == 0; + + /* + * There is then a case where idling must be performed not + * for throughput concerns, but to preserve service + * guarantees. + * + * To introduce this case, we can note that allowing the drive + * to enqueue more than one request at a time, and hence + * delegating de facto final scheduling decisions to the + * drive's internal scheduler, entails loss of control on the + * actual request service order. In particular, the critical + * situation is when requests from different processes happen + * to be present, at the same time, in the internal queue(s) + * of the drive. In such a situation, the drive, by deciding + * the service order of the internally-queued requests, does + * determine also the actual throughput distribution among + * these processes. But the drive typically has no notion or + * concern about per-process throughput distribution, and + * makes its decisions only on a per-request basis. Therefore, + * the service distribution enforced by the drive's internal + * scheduler is likely to coincide with the desired + * device-throughput distribution only in a completely + * symmetric scenario where: + * (i) each of these processes must get the same throughput as + * the others; + * (ii) all these processes have the same I/O pattern + * (either sequential or random). + * In fact, in such a scenario, the drive will tend to treat + * the requests of each of these processes in about the same + * way as the requests of the others, and thus to provide + * each of these processes with about the same throughput + * (which is exactly the desired throughput distribution). In + * contrast, in any asymmetric scenario, device idling is + * certainly needed to guarantee that bfqq receives its + * assigned fraction of the device throughput (see [1] for + * details). + * + * We address this issue by controlling, actually, only the + * symmetry sub-condition (i), i.e., provided that + * sub-condition (i) holds, idling is not performed, + * regardless of whether sub-condition (ii) holds. In other + * words, only if sub-condition (i) holds, then idling is + * allowed, and the device tends to be prevented from queueing + * many requests, possibly of several processes. The reason + * for not controlling also sub-condition (ii) is that we + * exploit preemption to preserve guarantees in case of + * symmetric scenarios, even if (ii) does not hold, as + * explained in the next two paragraphs. + * + * Even if a queue, say Q, is expired when it remains idle, Q + * can still preempt the new in-service queue if the next + * request of Q arrives soon (see the comments on + * bfq_bfqq_update_budg_for_activation). If all queues and + * groups have the same weight, this form of preemption, + * combined with the hole-recovery heuristic described in the + * comments on function bfq_bfqq_update_budg_for_activation, + * are enough to preserve a correct bandwidth distribution in + * the mid term, even without idling. In fact, even if not + * idling allows the internal queues of the device to contain + * many requests, and thus to reorder requests, we can rather + * safely assume that the internal scheduler still preserves a + * minimum of mid-term fairness. The motivation for using + * preemption instead of idling is that, by not idling, + * service guarantees are preserved without minimally + * sacrificing throughput. In other words, both a high + * throughput and its desired distribution are obtained. + * + * More precisely, this preemption-based, idleless approach + * provides fairness in terms of IOPS, and not sectors per + * second. This can be seen with a simple example. Suppose + * that there are two queues with the same weight, but that + * the first queue receives requests of 8 sectors, while the + * second queue receives requests of 1024 sectors. In + * addition, suppose that each of the two queues contains at + * most one request at a time, which implies that each queue + * always remains idle after it is served. Finally, after + * remaining idle, each queue receives very quickly a new + * request. It follows that the two queues are served + * alternatively, preempting each other if needed. This + * implies that, although both queues have the same weight, + * the queue with large requests receives a service that is + * 1024/8 times as high as the service received by the other + * queue. + * + * On the other hand, device idling is performed, and thus + * pure sector-domain guarantees are provided, for the + * following queues, which are likely to need stronger + * throughput guarantees: weight-raised queues, and queues + * with a higher weight than other queues. When such queues + * are active, sub-condition (i) is false, which triggers + * device idling. + * + * According to the above considerations, the next variable is + * true (only) if sub-condition (i) holds. To compute the + * value of this variable, we not only use the return value of + * the function bfq_symmetric_scenario(), but also check + * whether bfqq is being weight-raised, because + * bfq_symmetric_scenario() does not take into account also + * weight-raised queues (see comments on + * bfq_weights_tree_add()). + * + * As a side note, it is worth considering that the above + * device-idling countermeasures may however fail in the + * following unlucky scenario: if idling is (correctly) + * disabled in a time period during which all symmetry + * sub-conditions hold, and hence the device is allowed to + * enqueue many requests, but at some later point in time some + * sub-condition stops to hold, then it may become impossible + * to let requests be served in the desired order until all + * the requests already queued in the device have been served. + */ + asymmetric_scenario = bfqq->wr_coeff > 1 || + !bfq_symmetric_scenario(bfqd); + + /* + * Finally, there is a case where maximizing throughput is the + * best choice even if it may cause unfairness toward + * bfqq. Such a case is when bfqq became active in a burst of + * queue activations. Queues that became active during a large + * burst benefit only from throughput, as discussed in the + * comments on bfq_handle_burst. Thus, if bfqq became active + * in a burst and not idling the device maximizes throughput, + * then the device must no be idled, because not idling the + * device provides bfqq and all other queues in the burst with + * maximum benefit. Combining this and the above case, we can + * now establish when idling is actually needed to preserve + * service guarantees. + */ + idling_needed_for_service_guarantees = + asymmetric_scenario && !bfq_bfqq_in_large_burst(bfqq); + + /* + * We have now all the components we need to compute the return + * value of the function, which is true only if both the following + * conditions hold: + * 1) bfqq is sync, because idling make sense only for sync queues; + * 2) idling either boosts the throughput (without issues), or + * is necessary to preserve service guarantees. + */ + bfq_log_bfqq(bfqd, bfqq, "may_idle: sync %d idling_boosts_thr %d", + bfq_bfqq_sync(bfqq), idling_boosts_thr); + + bfq_log_bfqq(bfqd, bfqq, + "may_idle: wr_busy %d boosts %d IO-bound %d guar %d", + bfqd->wr_busy_queues, + idling_boosts_thr_without_issues, + bfq_bfqq_IO_bound(bfqq), + idling_needed_for_service_guarantees); + + return bfq_bfqq_sync(bfqq) && + (idling_boosts_thr_without_issues || + idling_needed_for_service_guarantees); +} + +/* + * If the in-service queue is empty but the function bfq_bfqq_may_idle + * returns true, then: + * 1) the queue must remain in service and cannot be expired, and + * 2) the device must be idled to wait for the possible arrival of a new + * request for the queue. + * See the comments on the function bfq_bfqq_may_idle for the reasons + * why performing device idling is the best choice to boost the throughput + * and preserve service guarantees when bfq_bfqq_may_idle itself + * returns true. + */ +static bool bfq_bfqq_must_idle(struct bfq_queue *bfqq) +{ + struct bfq_data *bfqd = bfqq->bfqd; + + return RB_EMPTY_ROOT(&bfqq->sort_list) && bfqd->bfq_slice_idle != 0 && + bfq_bfqq_may_idle(bfqq); +} + +/* + * Select a queue for service. If we have a current queue in service, + * check whether to continue servicing it, or retrieve and set a new one. + */ +static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) +{ + struct bfq_queue *bfqq; + struct request *next_rq; + enum bfqq_expiration reason = BFQ_BFQQ_BUDGET_TIMEOUT; + + bfqq = bfqd->in_service_queue; + if (!bfqq) + goto new_queue; + + bfq_log_bfqq(bfqd, bfqq, "select_queue: already in-service queue"); + + if (bfq_may_expire_for_budg_timeout(bfqq) && + !hrtimer_active(&bfqd->idle_slice_timer) && + !bfq_bfqq_must_idle(bfqq)) + goto expire; + +check_queue: + /* + * This loop is rarely executed more than once. Even when it + * happens, it is much more convenient to re-execute this loop + * than to return NULL and trigger a new dispatch to get a + * request served. + */ + next_rq = bfqq->next_rq; + /* + * If bfqq has requests queued and it has enough budget left to + * serve them, keep the queue, otherwise expire it. + */ + if (next_rq) { + BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list)); + + if (bfq_serv_to_charge(next_rq, bfqq) > + bfq_bfqq_budget_left(bfqq)) { + /* + * Expire the queue for budget exhaustion, + * which makes sure that the next budget is + * enough to serve the next request, even if + * it comes from the fifo expired path. + */ + reason = BFQ_BFQQ_BUDGET_EXHAUSTED; + goto expire; + } else { + /* + * The idle timer may be pending because we may + * not disable disk idling even when a new request + * arrives. + */ + if (bfq_bfqq_wait_request(bfqq)) { + BUG_ON(!hrtimer_active(&bfqd->idle_slice_timer)); + /* + * If we get here: 1) at least a new request + * has arrived but we have not disabled the + * timer because the request was too small, + * 2) then the block layer has unplugged + * the device, causing the dispatch to be + * invoked. + * + * Since the device is unplugged, now the + * requests are probably large enough to + * provide a reasonable throughput. + * So we disable idling. + */ + bfq_clear_bfqq_wait_request(bfqq); + hrtimer_try_to_cancel(&bfqd->idle_slice_timer); + bfqg_stats_update_idle_time(bfqq_group(bfqq)); + } + goto keep_queue; + } + } + + /* + * No requests pending. However, if the in-service queue is idling + * for a new request, or has requests waiting for a completion and + * may idle after their completion, then keep it anyway. + */ + if (hrtimer_active(&bfqd->idle_slice_timer) || + (bfqq->dispatched != 0 && bfq_bfqq_may_idle(bfqq))) { + bfqq = NULL; + goto keep_queue; + } + + reason = BFQ_BFQQ_NO_MORE_REQUESTS; +expire: + bfq_bfqq_expire(bfqd, bfqq, false, reason); +new_queue: + bfqq = bfq_set_in_service_queue(bfqd); + if (bfqq) { + bfq_log_bfqq(bfqd, bfqq, "select_queue: checking new queue"); + goto check_queue; + } +keep_queue: + if (bfqq) + bfq_log_bfqq(bfqd, bfqq, "select_queue: returned this queue"); + else + bfq_log(bfqd, "select_queue: no queue returned"); + + return bfqq; +} + +static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq) +{ + struct bfq_entity *entity = &bfqq->entity; + + if (bfqq->wr_coeff > 1) { /* queue is being weight-raised */ + BUG_ON(bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time && + time_is_after_jiffies(bfqq->last_wr_start_finish)); + + bfq_log_bfqq(bfqd, bfqq, + "raising period dur %u/%u msec, old coeff %u, w %d(%d)", + jiffies_to_msecs(jiffies - bfqq->last_wr_start_finish), + jiffies_to_msecs(bfqq->wr_cur_max_time), + bfqq->wr_coeff, + bfqq->entity.weight, bfqq->entity.orig_weight); + + BUG_ON(bfqq != bfqd->in_service_queue && entity->weight != + entity->orig_weight * bfqq->wr_coeff); + if (entity->prio_changed) + bfq_log_bfqq(bfqd, bfqq, "WARN: pending prio change"); + + /* + * If the queue was activated in a burst, or too much + * time has elapsed from the beginning of this + * weight-raising period, then end weight raising. + */ + if (bfq_bfqq_in_large_burst(bfqq)) + bfq_bfqq_end_wr(bfqq); + else if (time_is_before_jiffies(bfqq->last_wr_start_finish + + bfqq->wr_cur_max_time)) { + if (bfqq->wr_cur_max_time != bfqd->bfq_wr_rt_max_time || + time_is_before_jiffies(bfqq->wr_start_at_switch_to_srt + + bfq_wr_duration(bfqd))) + bfq_bfqq_end_wr(bfqq); + else { + /* switch back to interactive wr */ + bfqq->wr_coeff = bfqd->bfq_wr_coeff; + bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); + bfqq->last_wr_start_finish = + bfqq->wr_start_at_switch_to_srt; + BUG_ON(time_is_after_jiffies( + bfqq->last_wr_start_finish)); + bfqq->entity.prio_changed = 1; + bfq_log_bfqq(bfqd, bfqq, + "back to interactive wr"); + } + } + } + /* + * To improve latency (for this or other queues), immediately + * update weight both if it must be raised and if it must be + * lowered. Since, entity may be on some active tree here, and + * might have a pending change of its ioprio class, invoke + * next function with the last parameter unset (see the + * comments on the function). + */ + if ((entity->weight > entity->orig_weight) != (bfqq->wr_coeff > 1)) + __bfq_entity_update_weight_prio(bfq_entity_service_tree(entity), + entity, false); +} + +/* + * Dispatch one request from bfqq, moving it to the request queue + * dispatch list. + */ +static int bfq_dispatch_request(struct bfq_data *bfqd, + struct bfq_queue *bfqq) +{ + int dispatched = 0; + struct request *rq = bfqq->next_rq; + unsigned long service_to_charge; + + BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list)); + BUG_ON(!rq); + service_to_charge = bfq_serv_to_charge(rq, bfqq); + + BUG_ON(service_to_charge > bfq_bfqq_budget_left(bfqq)); + + BUG_ON(bfqq->entity.budget < bfqq->entity.service); + + bfq_bfqq_served(bfqq, service_to_charge); + + BUG_ON(bfqq->entity.budget < bfqq->entity.service); + + bfq_dispatch_insert(bfqd->queue, rq); + + /* + * If weight raising has to terminate for bfqq, then next + * function causes an immediate update of bfqq's weight, + * without waiting for next activation. As a consequence, on + * expiration, bfqq will be timestamped as if has never been + * weight-raised during this service slot, even if it has + * received part or even most of the service as a + * weight-raised queue. This inflates bfqq's timestamps, which + * is beneficial, as bfqq is then more willing to leave the + * device immediately to possible other weight-raised queues. + */ + bfq_update_wr_data(bfqd, bfqq); + + bfq_log_bfqq(bfqd, bfqq, + "dispatched %u sec req (%llu), budg left %d", + blk_rq_sectors(rq), + (unsigned long long) blk_rq_pos(rq), + bfq_bfqq_budget_left(bfqq)); + + dispatched++; + + if (!bfqd->in_service_bic) { + atomic_long_inc(&RQ_BIC(rq)->icq.ioc->refcount); + bfqd->in_service_bic = RQ_BIC(rq); + } + + if (bfqd->busy_queues > 1 && bfq_class_idle(bfqq)) + goto expire; + + return dispatched; + +expire: + bfq_bfqq_expire(bfqd, bfqq, false, BFQ_BFQQ_BUDGET_EXHAUSTED); + return dispatched; +} + +static int __bfq_forced_dispatch_bfqq(struct bfq_queue *bfqq) +{ + int dispatched = 0; + + while (bfqq->next_rq) { + bfq_dispatch_insert(bfqq->bfqd->queue, bfqq->next_rq); + dispatched++; + } + + BUG_ON(!list_empty(&bfqq->fifo)); + return dispatched; +} + +/* + * Drain our current requests. + * Used for barriers and when switching io schedulers on-the-fly. + */ +static int bfq_forced_dispatch(struct bfq_data *bfqd) +{ + struct bfq_queue *bfqq, *n; + struct bfq_service_tree *st; + int dispatched = 0; + + bfqq = bfqd->in_service_queue; + if (bfqq) + __bfq_bfqq_expire(bfqd, bfqq); + + /* + * Loop through classes, and be careful to leave the scheduler + * in a consistent state, as feedback mechanisms and vtime + * updates cannot be disabled during the process. + */ + list_for_each_entry_safe(bfqq, n, &bfqd->active_list, bfqq_list) { + st = bfq_entity_service_tree(&bfqq->entity); + + dispatched += __bfq_forced_dispatch_bfqq(bfqq); + + bfqq->max_budget = bfq_max_budget(bfqd); + bfq_forget_idle(st); + } + + BUG_ON(bfqd->busy_queues != 0); + + return dispatched; +} + +static int bfq_dispatch_requests(struct request_queue *q, int force) +{ + struct bfq_data *bfqd = q->elevator->elevator_data; + struct bfq_queue *bfqq; + + bfq_log(bfqd, "dispatch requests: %d busy queues", bfqd->busy_queues); + + if (bfqd->busy_queues == 0) + return 0; + + if (unlikely(force)) + return bfq_forced_dispatch(bfqd); + + /* + * Force device to serve one request at a time if + * strict_guarantees is true. Forcing this service scheme is + * currently the ONLY way to guarantee that the request + * service order enforced by the scheduler is respected by a + * queueing device. Otherwise the device is free even to make + * some unlucky request wait for as long as the device + * wishes. + * + * Of course, serving one request at at time may cause loss of + * throughput. + */ + if (bfqd->strict_guarantees && bfqd->rq_in_driver > 0) + return 0; + + bfqq = bfq_select_queue(bfqd); + if (!bfqq) + return 0; + + BUG_ON(bfqq->entity.budget < bfqq->entity.service); + + BUG_ON(bfq_bfqq_wait_request(bfqq)); + + if (!bfq_dispatch_request(bfqd, bfqq)) + return 0; + + bfq_log_bfqq(bfqd, bfqq, "dispatched %s request", + bfq_bfqq_sync(bfqq) ? "sync" : "async"); + + BUG_ON(bfqq->next_rq == NULL && + bfqq->entity.budget < bfqq->entity.service); + return 1; +} + +/* + * Task holds one reference to the queue, dropped when task exits. Each rq + * in-flight on this queue also holds a reference, dropped when rq is freed. + * + * Queue lock must be held here. Recall not to use bfqq after calling + * this function on it. + */ +static void bfq_put_queue(struct bfq_queue *bfqq) +{ +#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED + struct bfq_group *bfqg = bfqq_group(bfqq); +#endif + + BUG_ON(bfqq->ref <= 0); + + bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p %d", bfqq, bfqq->ref); + bfqq->ref--; + if (bfqq->ref) + return; + + BUG_ON(rb_first(&bfqq->sort_list)); + BUG_ON(bfqq->allocated[READ] + bfqq->allocated[WRITE] != 0); + BUG_ON(bfqq->entity.tree); + BUG_ON(bfq_bfqq_busy(bfqq)); + + if (bfq_bfqq_sync(bfqq)) + /* + * The fact that this queue is being destroyed does not + * invalidate the fact that this queue may have been + * activated during the current burst. As a consequence, + * although the queue does not exist anymore, and hence + * needs to be removed from the burst list if there, + * the burst size has not to be decremented. + */ + hlist_del_init(&bfqq->burst_list_node); + + bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p freed", bfqq); + + kmem_cache_free(bfq_pool, bfqq); +#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED + bfqg_put(bfqg); +#endif +} + +static void bfq_put_cooperator(struct bfq_queue *bfqq) +{ + struct bfq_queue *__bfqq, *next; + + /* + * If this queue was scheduled to merge with another queue, be + * sure to drop the reference taken on that queue (and others in + * the merge chain). See bfq_setup_merge and bfq_merge_bfqqs. + */ + __bfqq = bfqq->new_bfqq; + while (__bfqq) { + if (__bfqq == bfqq) + break; + next = __bfqq->new_bfqq; + bfq_put_queue(__bfqq); + __bfqq = next; + } +} + +static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) +{ + if (bfqq == bfqd->in_service_queue) { + __bfq_bfqq_expire(bfqd, bfqq); + bfq_schedule_dispatch(bfqd); + } + + bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq, bfqq->ref); + + bfq_put_cooperator(bfqq); + + bfq_put_queue(bfqq); /* release process reference */ +} + +static void bfq_init_icq(struct io_cq *icq) +{ + icq_to_bic(icq)->ttime.last_end_request = ktime_get_ns() - (1ULL<<32); +} + +static void bfq_exit_icq(struct io_cq *icq) +{ + struct bfq_io_cq *bic = icq_to_bic(icq); + struct bfq_data *bfqd = bic_to_bfqd(bic); + + if (bic_to_bfqq(bic, false)) { + bfq_exit_bfqq(bfqd, bic_to_bfqq(bic, false)); + bic_set_bfqq(bic, NULL, false); + } + + if (bic_to_bfqq(bic, true)) { + /* + * If the bic is using a shared queue, put the reference + * taken on the io_context when the bic started using a + * shared bfq_queue. + */ + if (bfq_bfqq_coop(bic_to_bfqq(bic, true))) + put_io_context(icq->ioc); + bfq_exit_bfqq(bfqd, bic_to_bfqq(bic, true)); + bic_set_bfqq(bic, NULL, true); + } +} + +/* + * Update the entity prio values; note that the new values will not + * be used until the next (re)activation. + */ +static void bfq_set_next_ioprio_data(struct bfq_queue *bfqq, + struct bfq_io_cq *bic) +{ + struct task_struct *tsk = current; + int ioprio_class; + + ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio); + switch (ioprio_class) { + default: + dev_err(bfqq->bfqd->queue->backing_dev_info->dev, + "bfq: bad prio class %d\n", ioprio_class); + case IOPRIO_CLASS_NONE: + /* + * No prio set, inherit CPU scheduling settings. + */ + bfqq->new_ioprio = task_nice_ioprio(tsk); + bfqq->new_ioprio_class = task_nice_ioclass(tsk); + break; + case IOPRIO_CLASS_RT: + bfqq->new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio); + bfqq->new_ioprio_class = IOPRIO_CLASS_RT; + break; + case IOPRIO_CLASS_BE: + bfqq->new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio); + bfqq->new_ioprio_class = IOPRIO_CLASS_BE; + break; + case IOPRIO_CLASS_IDLE: + bfqq->new_ioprio_class = IOPRIO_CLASS_IDLE; + bfqq->new_ioprio = 7; + bfq_clear_bfqq_idle_window(bfqq); + break; + } + + if (bfqq->new_ioprio >= IOPRIO_BE_NR) { + pr_crit("bfq_set_next_ioprio_data: new_ioprio %d\n", + bfqq->new_ioprio); + BUG(); + } + + bfqq->entity.new_weight = bfq_ioprio_to_weight(bfqq->new_ioprio); + bfqq->entity.prio_changed = 1; + bfq_log_bfqq(bfqq->bfqd, bfqq, + "set_next_ioprio_data: bic_class %d prio %d class %d", + ioprio_class, bfqq->new_ioprio, bfqq->new_ioprio_class); +} + +static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio) +{ + struct bfq_data *bfqd = bic_to_bfqd(bic); + struct bfq_queue *bfqq; + unsigned long uninitialized_var(flags); + int ioprio = bic->icq.ioc->ioprio; + + /* + * This condition may trigger on a newly created bic, be sure to + * drop the lock before returning. + */ + if (unlikely(!bfqd) || likely(bic->ioprio == ioprio)) + return; + + bic->ioprio = ioprio; + + bfqq = bic_to_bfqq(bic, false); + if (bfqq) { + /* release process reference on this queue */ + bfq_put_queue(bfqq); + bfqq = bfq_get_queue(bfqd, bio, BLK_RW_ASYNC, bic); + bic_set_bfqq(bic, bfqq, false); + bfq_log_bfqq(bfqd, bfqq, + "check_ioprio_change: bfqq %p %d", + bfqq, bfqq->ref); + } + + bfqq = bic_to_bfqq(bic, true); + if (bfqq) + bfq_set_next_ioprio_data(bfqq, bic); +} + +static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, + struct bfq_io_cq *bic, pid_t pid, int is_sync) +{ + RB_CLEAR_NODE(&bfqq->entity.rb_node); + INIT_LIST_HEAD(&bfqq->fifo); + INIT_HLIST_NODE(&bfqq->burst_list_node); + BUG_ON(!hlist_unhashed(&bfqq->burst_list_node)); + + bfqq->ref = 0; + bfqq->bfqd = bfqd; + + if (bic) + bfq_set_next_ioprio_data(bfqq, bic); + + if (is_sync) { + if (!bfq_class_idle(bfqq)) + bfq_mark_bfqq_idle_window(bfqq); + bfq_mark_bfqq_sync(bfqq); + bfq_mark_bfqq_just_created(bfqq); + } else + bfq_clear_bfqq_sync(bfqq); + bfq_mark_bfqq_IO_bound(bfqq); + + /* Tentative initial value to trade off between thr and lat */ + bfqq->max_budget = (2 * bfq_max_budget(bfqd)) / 3; + bfqq->pid = pid; + + bfqq->wr_coeff = 1; + bfqq->last_wr_start_finish = jiffies; + bfqq->wr_start_at_switch_to_srt = bfq_smallest_from_now(); + bfqq->budget_timeout = bfq_smallest_from_now(); + bfqq->split_time = bfq_smallest_from_now(); + + /* + * Set to the value for which bfqq will not be deemed as + * soft rt when it becomes backlogged. + */ + bfqq->soft_rt_next_start = bfq_greatest_from_now(); + + /* first request is almost certainly seeky */ + bfqq->seek_history = 1; +} + +static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd, + struct bfq_group *bfqg, + int ioprio_class, int ioprio) +{ + switch (ioprio_class) { + case IOPRIO_CLASS_RT: + return &bfqg->async_bfqq[0][ioprio]; + case IOPRIO_CLASS_NONE: + ioprio = IOPRIO_NORM; + /* fall through */ + case IOPRIO_CLASS_BE: + return &bfqg->async_bfqq[1][ioprio]; + case IOPRIO_CLASS_IDLE: + return &bfqg->async_idle_bfqq; + default: + BUG(); + } +} + +static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, + struct bio *bio, bool is_sync, + struct bfq_io_cq *bic) +{ + const int ioprio = IOPRIO_PRIO_DATA(bic->ioprio); + const int ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio); + struct bfq_queue **async_bfqq = NULL; + struct bfq_queue *bfqq; + struct bfq_group *bfqg; + + rcu_read_lock(); + + bfqg = bfq_find_set_group(bfqd, bio_blkcg(bio)); + if (!bfqg) { + bfqq = &bfqd->oom_bfqq; + goto out; + } + + if (!is_sync) { + async_bfqq = bfq_async_queue_prio(bfqd, bfqg, ioprio_class, + ioprio); + bfqq = *async_bfqq; + if (bfqq) + goto out; + } + + bfqq = kmem_cache_alloc_node(bfq_pool, + GFP_NOWAIT | __GFP_ZERO | __GFP_NOWARN, + bfqd->queue->node); + + if (bfqq) { + bfq_init_bfqq(bfqd, bfqq, bic, current->pid, + is_sync); + bfq_init_entity(&bfqq->entity, bfqg); + bfq_log_bfqq(bfqd, bfqq, "allocated"); + } else { + bfqq = &bfqd->oom_bfqq; + bfq_log_bfqq(bfqd, bfqq, "using oom bfqq"); + goto out; + } + + /* + * Pin the queue now that it's allocated, scheduler exit will + * prune it. + */ + if (async_bfqq) { + bfqq->ref++; /* + * Extra group reference, w.r.t. sync + * queue. This extra reference is removed + * only if bfqq->bfqg disappears, to + * guarantee that this queue is not freed + * until its group goes away. + */ + bfq_log_bfqq(bfqd, bfqq, "get_queue, bfqq not in async: %p, %d", + bfqq, bfqq->ref); + *async_bfqq = bfqq; + } + +out: + bfqq->ref++; /* get a process reference to this queue */ + bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq, bfqq->ref); + rcu_read_unlock(); + return bfqq; +} + +static void bfq_update_io_thinktime(struct bfq_data *bfqd, + struct bfq_io_cq *bic) +{ + struct bfq_ttime *ttime = &bic->ttime; + u64 elapsed = ktime_get_ns() - bic->ttime.last_end_request; + + elapsed = min_t(u64, elapsed, 2 * bfqd->bfq_slice_idle); + + ttime->ttime_samples = (7*bic->ttime.ttime_samples + 256) / 8; + ttime->ttime_total = div_u64(7*ttime->ttime_total + 256*elapsed, 8); + ttime->ttime_mean = div64_ul(ttime->ttime_total + 128, + ttime->ttime_samples); +} + +static void +bfq_update_io_seektime(struct bfq_data *bfqd, struct bfq_queue *bfqq, + struct request *rq) +{ + bfqq->seek_history <<= 1; + bfqq->seek_history |= + get_sdist(bfqq->last_request_pos, rq) > BFQQ_SEEK_THR && + (!blk_queue_nonrot(bfqd->queue) || + blk_rq_sectors(rq) < BFQQ_SECT_THR_NONROT); +} + +/* + * Disable idle window if the process thinks too long or seeks so much that + * it doesn't matter. + */ +static void bfq_update_idle_window(struct bfq_data *bfqd, + struct bfq_queue *bfqq, + struct bfq_io_cq *bic) +{ + int enable_idle; + + /* Don't idle for async or idle io prio class. */ + if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq)) + return; + + /* Idle window just restored, statistics are meaningless. */ + if (time_is_after_eq_jiffies(bfqq->split_time + + bfqd->bfq_wr_min_idle_time)) + return; + + enable_idle = bfq_bfqq_idle_window(bfqq); + + if (atomic_read(&bic->icq.ioc->active_ref) == 0 || + bfqd->bfq_slice_idle == 0 || + (bfqd->hw_tag && BFQQ_SEEKY(bfqq) && + bfqq->wr_coeff == 1)) + enable_idle = 0; + else if (bfq_sample_valid(bic->ttime.ttime_samples)) { + if (bic->ttime.ttime_mean > bfqd->bfq_slice_idle && + bfqq->wr_coeff == 1) + enable_idle = 0; + else + enable_idle = 1; + } + bfq_log_bfqq(bfqd, bfqq, "update_idle_window: enable_idle %d", + enable_idle); + + if (enable_idle) + bfq_mark_bfqq_idle_window(bfqq); + else + bfq_clear_bfqq_idle_window(bfqq); +} + +/* + * Called when a new fs request (rq) is added to bfqq. Check if there's + * something we should do about it. + */ +static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, + struct request *rq) +{ + struct bfq_io_cq *bic = RQ_BIC(rq); + + if (rq->cmd_flags & REQ_META) + bfqq->meta_pending++; + + bfq_update_io_thinktime(bfqd, bic); + bfq_update_io_seektime(bfqd, bfqq, rq); + if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 || + !BFQQ_SEEKY(bfqq)) + bfq_update_idle_window(bfqd, bfqq, bic); + + bfq_log_bfqq(bfqd, bfqq, + "rq_enqueued: idle_window=%d (seeky %d)", + bfq_bfqq_idle_window(bfqq), BFQQ_SEEKY(bfqq)); + + bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq); + + if (bfqq == bfqd->in_service_queue && bfq_bfqq_wait_request(bfqq)) { + bool small_req = bfqq->queued[rq_is_sync(rq)] == 1 && + blk_rq_sectors(rq) < 32; + bool budget_timeout = bfq_bfqq_budget_timeout(bfqq); + + /* + * There is just this request queued: if the request + * is small and the queue is not to be expired, then + * just exit. + * + * In this way, if the device is being idled to wait + * for a new request from the in-service queue, we + * avoid unplugging the device and committing the + * device to serve just a small request. On the + * contrary, we wait for the block layer to decide + * when to unplug the device: hopefully, new requests + * will be merged to this one quickly, then the device + * will be unplugged and larger requests will be + * dispatched. + */ + if (small_req && !budget_timeout) + return; + + /* + * A large enough request arrived, or the queue is to + * be expired: in both cases disk idling is to be + * stopped, so clear wait_request flag and reset + * timer. + */ + bfq_clear_bfqq_wait_request(bfqq); + hrtimer_try_to_cancel(&bfqd->idle_slice_timer); + bfqg_stats_update_idle_time(bfqq_group(bfqq)); + + /* + * The queue is not empty, because a new request just + * arrived. Hence we can safely expire the queue, in + * case of budget timeout, without risking that the + * timestamps of the queue are not updated correctly. + * See [1] for more details. + */ + if (budget_timeout) + bfq_bfqq_expire(bfqd, bfqq, false, + BFQ_BFQQ_BUDGET_TIMEOUT); + + /* + * Let the request rip immediately, or let a new queue be + * selected if bfqq has just been expired. + */ + __blk_run_queue(bfqd->queue); + } +} + +static void bfq_insert_request(struct request_queue *q, struct request *rq) +{ + struct bfq_data *bfqd = q->elevator->elevator_data; + struct bfq_queue *bfqq = RQ_BFQQ(rq), *new_bfqq; + + assert_spin_locked(bfqd->queue->queue_lock); + + /* + * An unplug may trigger a requeue of a request from the device + * driver: make sure we are in process context while trying to + * merge two bfq_queues. + */ + if (!in_interrupt()) { + new_bfqq = bfq_setup_cooperator(bfqd, bfqq, rq, true); + if (new_bfqq) { + if (bic_to_bfqq(RQ_BIC(rq), 1) != bfqq) + new_bfqq = bic_to_bfqq(RQ_BIC(rq), 1); + /* + * Release the request's reference to the old bfqq + * and make sure one is taken to the shared queue. + */ + new_bfqq->allocated[rq_data_dir(rq)]++; + bfqq->allocated[rq_data_dir(rq)]--; + new_bfqq->ref++; + bfq_clear_bfqq_just_created(bfqq); + if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq) + bfq_merge_bfqqs(bfqd, RQ_BIC(rq), + bfqq, new_bfqq); + /* + * rq is about to be enqueued into new_bfqq, + * release rq reference on bfqq + */ + bfq_put_queue(bfqq); + rq->elv.priv[1] = new_bfqq; + bfqq = new_bfqq; + } + } + + bfq_add_request(rq); + + rq->fifo_time = ktime_get_ns() + bfqd->bfq_fifo_expire[rq_is_sync(rq)]; + list_add_tail(&rq->queuelist, &bfqq->fifo); + + bfq_rq_enqueued(bfqd, bfqq, rq); +} + +static void bfq_update_hw_tag(struct bfq_data *bfqd) +{ + bfqd->max_rq_in_driver = max_t(int, bfqd->max_rq_in_driver, + bfqd->rq_in_driver); + + if (bfqd->hw_tag == 1) + return; + + /* + * This sample is valid if the number of outstanding requests + * is large enough to allow a queueing behavior. Note that the + * sum is not exact, as it's not taking into account deactivated + * requests. + */ + if (bfqd->rq_in_driver + bfqd->queued < BFQ_HW_QUEUE_THRESHOLD) + return; + + if (bfqd->hw_tag_samples++ < BFQ_HW_QUEUE_SAMPLES) + return; + + bfqd->hw_tag = bfqd->max_rq_in_driver > BFQ_HW_QUEUE_THRESHOLD; + bfqd->max_rq_in_driver = 0; + bfqd->hw_tag_samples = 0; +} + +static void bfq_completed_request(struct request_queue *q, struct request *rq) +{ + struct bfq_queue *bfqq = RQ_BFQQ(rq); + struct bfq_data *bfqd = bfqq->bfqd; + u64 now_ns; + u32 delta_us; + + bfq_log_bfqq(bfqd, bfqq, "completed one req with %u sects left", + blk_rq_sectors(rq)); + + assert_spin_locked(bfqd->queue->queue_lock); + bfq_update_hw_tag(bfqd); + + BUG_ON(!bfqd->rq_in_driver); + BUG_ON(!bfqq->dispatched); + bfqd->rq_in_driver--; + bfqq->dispatched--; + bfqg_stats_update_completion(bfqq_group(bfqq), + rq_start_time_ns(rq), + rq_io_start_time_ns(rq), + rq->cmd_flags); + + if (!bfqq->dispatched && !bfq_bfqq_busy(bfqq)) { + BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list)); + /* + * Set budget_timeout (which we overload to store the + * time at which the queue remains with no backlog and + * no outstanding request; used by the weight-raising + * mechanism). + */ + bfqq->budget_timeout = jiffies; + + bfq_weights_tree_remove(bfqd, &bfqq->entity, + &bfqd->queue_weights_tree); + } + + now_ns = ktime_get_ns(); + + RQ_BIC(rq)->ttime.last_end_request = now_ns; + + /* + * Using us instead of ns, to get a reasonable precision in + * computing rate in next check. + */ + delta_us = div_u64(now_ns - bfqd->last_completion, NSEC_PER_USEC); + + bfq_log(bfqd, "rq_completed: delta %uus/%luus max_size %u rate %llu/%llu", + delta_us, BFQ_MIN_TT/NSEC_PER_USEC, bfqd->last_rq_max_size, + (USEC_PER_SEC* + (u64)((bfqd->last_rq_max_size<>BFQ_RATE_SHIFT, + (USEC_PER_SEC*(u64)(1UL<<(BFQ_RATE_SHIFT-10)))>>BFQ_RATE_SHIFT); + + /* + * If the request took rather long to complete, and, according + * to the maximum request size recorded, this completion latency + * implies that the request was certainly served at a very low + * rate (less than 1M sectors/sec), then the whole observation + * interval that lasts up to this time instant cannot be a + * valid time interval for computing a new peak rate. Invoke + * bfq_update_rate_reset to have the following three steps + * taken: + * - close the observation interval at the last (previous) + * request dispatch or completion + * - compute rate, if possible, for that observation interval + * - reset to zero samples, which will trigger a proper + * re-initialization of the observation interval on next + * dispatch + */ + if (delta_us > BFQ_MIN_TT/NSEC_PER_USEC && + (bfqd->last_rq_max_size<last_completion = now_ns; + + /* + * If we are waiting to discover whether the request pattern + * of the task associated with the queue is actually + * isochronous, and both requisites for this condition to hold + * are now satisfied, then compute soft_rt_next_start (see the + * comments on the function bfq_bfqq_softrt_next_start()). We + * schedule this delayed check when bfqq expires, if it still + * has in-flight requests. + */ + if (bfq_bfqq_softrt_update(bfqq) && bfqq->dispatched == 0 && + RB_EMPTY_ROOT(&bfqq->sort_list)) + bfqq->soft_rt_next_start = + bfq_bfqq_softrt_next_start(bfqd, bfqq); + + /* + * If this is the in-service queue, check if it needs to be expired, + * or if we want to idle in case it has no pending requests. + */ + if (bfqd->in_service_queue == bfqq) { + if (bfqq->dispatched == 0 && bfq_bfqq_must_idle(bfqq)) { + bfq_arm_slice_timer(bfqd); + goto out; + } else if (bfq_may_expire_for_budg_timeout(bfqq)) + bfq_bfqq_expire(bfqd, bfqq, false, + BFQ_BFQQ_BUDGET_TIMEOUT); + else if (RB_EMPTY_ROOT(&bfqq->sort_list) && + (bfqq->dispatched == 0 || + !bfq_bfqq_may_idle(bfqq))) + bfq_bfqq_expire(bfqd, bfqq, false, + BFQ_BFQQ_NO_MORE_REQUESTS); + } + + if (!bfqd->rq_in_driver) + bfq_schedule_dispatch(bfqd); + +out: + return; +} + +static int __bfq_may_queue(struct bfq_queue *bfqq) +{ + if (bfq_bfqq_wait_request(bfqq) && bfq_bfqq_must_alloc(bfqq)) { + bfq_clear_bfqq_must_alloc(bfqq); + return ELV_MQUEUE_MUST; + } + + return ELV_MQUEUE_MAY; +} + +static int bfq_may_queue(struct request_queue *q, unsigned int op) +{ + struct bfq_data *bfqd = q->elevator->elevator_data; + struct task_struct *tsk = current; + struct bfq_io_cq *bic; + struct bfq_queue *bfqq; + + /* + * Don't force setup of a queue from here, as a call to may_queue + * does not necessarily imply that a request actually will be + * queued. So just lookup a possibly existing queue, or return + * 'may queue' if that fails. + */ + bic = bfq_bic_lookup(bfqd, tsk->io_context); + if (!bic) + return ELV_MQUEUE_MAY; + + bfqq = bic_to_bfqq(bic, op_is_sync(op)); + if (bfqq) + return __bfq_may_queue(bfqq); + + return ELV_MQUEUE_MAY; +} + +/* + * Queue lock held here. + */ +static void bfq_put_request(struct request *rq) +{ + struct bfq_queue *bfqq = RQ_BFQQ(rq); + + if (bfqq) { + const int rw = rq_data_dir(rq); + + BUG_ON(!bfqq->allocated[rw]); + bfqq->allocated[rw]--; + + rq->elv.priv[0] = NULL; + rq->elv.priv[1] = NULL; + + bfq_log_bfqq(bfqq->bfqd, bfqq, "put_request %p, %d", + bfqq, bfqq->ref); + bfq_put_queue(bfqq); + } +} + +/* + * Returns NULL if a new bfqq should be allocated, or the old bfqq if this + * was the last process referring to that bfqq. + */ +static struct bfq_queue * +bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq) +{ + bfq_log_bfqq(bfqq->bfqd, bfqq, "splitting queue"); + + put_io_context(bic->icq.ioc); + + if (bfqq_process_refs(bfqq) == 1) { + bfqq->pid = current->pid; + bfq_clear_bfqq_coop(bfqq); + bfq_clear_bfqq_split_coop(bfqq); + return bfqq; + } + + bic_set_bfqq(bic, NULL, 1); + + bfq_put_cooperator(bfqq); + + bfq_put_queue(bfqq); + return NULL; +} + +/* + * Allocate bfq data structures associated with this request. + */ +static int bfq_set_request(struct request_queue *q, struct request *rq, + struct bio *bio, gfp_t gfp_mask) +{ + struct bfq_data *bfqd = q->elevator->elevator_data; + struct bfq_io_cq *bic = icq_to_bic(rq->elv.icq); + const int rw = rq_data_dir(rq); + const int is_sync = rq_is_sync(rq); + struct bfq_queue *bfqq; + unsigned long flags; + bool bfqq_already_existing = false, split = false; + + spin_lock_irqsave(q->queue_lock, flags); + + if (!bic) + goto queue_fail; + + bfq_check_ioprio_change(bic, bio); + + bfq_bic_update_cgroup(bic, bio); + +new_queue: + bfqq = bic_to_bfqq(bic, is_sync); + if (!bfqq || bfqq == &bfqd->oom_bfqq) { + if (bfqq) + bfq_put_queue(bfqq); + bfqq = bfq_get_queue(bfqd, bio, is_sync, bic); + BUG_ON(!hlist_unhashed(&bfqq->burst_list_node)); + + bic_set_bfqq(bic, bfqq, is_sync); + if (split && is_sync) { + bfq_log_bfqq(bfqd, bfqq, + "set_request: was_in_list %d " + "was_in_large_burst %d " + "large burst in progress %d", + bic->was_in_burst_list, + bic->saved_in_large_burst, + bfqd->large_burst); + + if ((bic->was_in_burst_list && bfqd->large_burst) || + bic->saved_in_large_burst) { + bfq_log_bfqq(bfqd, bfqq, + "set_request: marking in " + "large burst"); + bfq_mark_bfqq_in_large_burst(bfqq); + } else { + bfq_log_bfqq(bfqd, bfqq, + "set_request: clearing in " + "large burst"); + bfq_clear_bfqq_in_large_burst(bfqq); + if (bic->was_in_burst_list) + hlist_add_head(&bfqq->burst_list_node, + &bfqd->burst_list); + } + bfqq->split_time = jiffies; + } + } else { + /* If the queue was seeky for too long, break it apart. */ + if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) { + bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq"); + + /* Update bic before losing reference to bfqq */ + if (bfq_bfqq_in_large_burst(bfqq)) + bic->saved_in_large_burst = true; + + bfqq = bfq_split_bfqq(bic, bfqq); + split = true; + if (!bfqq) + goto new_queue; + else + bfqq_already_existing = true; + } + } + + bfqq->allocated[rw]++; + bfqq->ref++; + bfq_log_bfqq(bfqd, bfqq, "set_request: bfqq %p, %d", bfqq, bfqq->ref); + + rq->elv.priv[0] = bic; + rq->elv.priv[1] = bfqq; + + /* + * If a bfq_queue has only one process reference, it is owned + * by only one bfq_io_cq: we can set the bic field of the + * bfq_queue to the address of that structure. Also, if the + * queue has just been split, mark a flag so that the + * information is available to the other scheduler hooks. + */ + if (likely(bfqq != &bfqd->oom_bfqq) && bfqq_process_refs(bfqq) == 1) { + bfqq->bic = bic; + if (split) { + /* + * If the queue has just been split from a shared + * queue, restore the idle window and the possible + * weight raising period. + */ + bfq_bfqq_resume_state(bfqq, bfqd, bic, + bfqq_already_existing); + } + } + + if (unlikely(bfq_bfqq_just_created(bfqq))) + bfq_handle_burst(bfqd, bfqq); + + spin_unlock_irqrestore(q->queue_lock, flags); + + return 0; + +queue_fail: + bfq_schedule_dispatch(bfqd); + spin_unlock_irqrestore(q->queue_lock, flags); + + return 1; +} + +static void bfq_kick_queue(struct work_struct *work) +{ + struct bfq_data *bfqd = + container_of(work, struct bfq_data, unplug_work); + struct request_queue *q = bfqd->queue; + + spin_lock_irq(q->queue_lock); + __blk_run_queue(q); + spin_unlock_irq(q->queue_lock); +} + +/* + * Handler of the expiration of the timer running if the in-service queue + * is idling inside its time slice. + */ +static enum hrtimer_restart bfq_idle_slice_timer(struct hrtimer *timer) +{ + struct bfq_data *bfqd = container_of(timer, struct bfq_data, + idle_slice_timer); + struct bfq_queue *bfqq; + unsigned long flags; + enum bfqq_expiration reason; + + spin_lock_irqsave(bfqd->queue->queue_lock, flags); + + bfqq = bfqd->in_service_queue; + /* + * Theoretical race here: the in-service queue can be NULL or + * different from the queue that was idling if the timer handler + * spins on the queue_lock and a new request arrives for the + * current queue and there is a full dispatch cycle that changes + * the in-service queue. This can hardly happen, but in the worst + * case we just expire a queue too early. + */ + if (bfqq) { + bfq_log_bfqq(bfqd, bfqq, "slice_timer expired"); + bfq_clear_bfqq_wait_request(bfqq); + + if (bfq_bfqq_budget_timeout(bfqq)) + /* + * Also here the queue can be safely expired + * for budget timeout without wasting + * guarantees + */ + reason = BFQ_BFQQ_BUDGET_TIMEOUT; + else if (bfqq->queued[0] == 0 && bfqq->queued[1] == 0) + /* + * The queue may not be empty upon timer expiration, + * because we may not disable the timer when the + * first request of the in-service queue arrives + * during disk idling. + */ + reason = BFQ_BFQQ_TOO_IDLE; + else + goto schedule_dispatch; + + bfq_bfqq_expire(bfqd, bfqq, true, reason); + } + +schedule_dispatch: + bfq_schedule_dispatch(bfqd); + + spin_unlock_irqrestore(bfqd->queue->queue_lock, flags); + return HRTIMER_NORESTART; +} + +static void bfq_shutdown_timer_wq(struct bfq_data *bfqd) +{ + hrtimer_cancel(&bfqd->idle_slice_timer); + cancel_work_sync(&bfqd->unplug_work); +} + +static void __bfq_put_async_bfqq(struct bfq_data *bfqd, + struct bfq_queue **bfqq_ptr) +{ + struct bfq_group *root_group = bfqd->root_group; + struct bfq_queue *bfqq = *bfqq_ptr; + + bfq_log(bfqd, "put_async_bfqq: %p", bfqq); + if (bfqq) { + bfq_bfqq_move(bfqd, bfqq, root_group); + bfq_log_bfqq(bfqd, bfqq, "put_async_bfqq: putting %p, %d", + bfqq, bfqq->ref); + bfq_put_queue(bfqq); + *bfqq_ptr = NULL; + } +} + +/* + * Release all the bfqg references to its async queues. If we are + * deallocating the group these queues may still contain requests, so + * we reparent them to the root cgroup (i.e., the only one that will + * exist for sure until all the requests on a device are gone). + */ +static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg) +{ + int i, j; + + for (i = 0; i < 2; i++) + for (j = 0; j < IOPRIO_BE_NR; j++) + __bfq_put_async_bfqq(bfqd, &bfqg->async_bfqq[i][j]); + + __bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq); +} + +static void bfq_exit_queue(struct elevator_queue *e) +{ + struct bfq_data *bfqd = e->elevator_data; + struct request_queue *q = bfqd->queue; + struct bfq_queue *bfqq, *n; + + bfq_shutdown_timer_wq(bfqd); + + spin_lock_irq(q->queue_lock); + + BUG_ON(bfqd->in_service_queue); + list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list) + bfq_deactivate_bfqq(bfqd, bfqq, false, false); + + spin_unlock_irq(q->queue_lock); + + bfq_shutdown_timer_wq(bfqd); + + BUG_ON(hrtimer_active(&bfqd->idle_slice_timer)); + +#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED + blkcg_deactivate_policy(q, &blkcg_policy_bfq); +#else + bfq_put_async_queues(bfqd, bfqd->root_group); + kfree(bfqd->root_group); +#endif + + kfree(bfqd); +} + +static void bfq_init_root_group(struct bfq_group *root_group, + struct bfq_data *bfqd) +{ + int i; + +#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED + root_group->entity.parent = NULL; + root_group->my_entity = NULL; + root_group->bfqd = bfqd; +#endif + root_group->rq_pos_tree = RB_ROOT; + for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) + root_group->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT; + root_group->sched_data.bfq_class_idle_last_service = jiffies; +} + +static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) +{ + struct bfq_data *bfqd; + struct elevator_queue *eq; + + eq = elevator_alloc(q, e); + if (!eq) + return -ENOMEM; + + bfqd = kzalloc_node(sizeof(*bfqd), GFP_KERNEL, q->node); + if (!bfqd) { + kobject_put(&eq->kobj); + return -ENOMEM; + } + eq->elevator_data = bfqd; + + /* + * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues. + * Grab a permanent reference to it, so that the normal code flow + * will not attempt to free it. + */ + bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, NULL, 1, 0); + bfqd->oom_bfqq.ref++; + bfqd->oom_bfqq.new_ioprio = BFQ_DEFAULT_QUEUE_IOPRIO; + bfqd->oom_bfqq.new_ioprio_class = IOPRIO_CLASS_BE; + bfqd->oom_bfqq.entity.new_weight = + bfq_ioprio_to_weight(bfqd->oom_bfqq.new_ioprio); + + /* oom_bfqq does not participate to bursts */ + bfq_clear_bfqq_just_created(&bfqd->oom_bfqq); + /* + * Trigger weight initialization, according to ioprio, at the + * oom_bfqq's first activation. The oom_bfqq's ioprio and ioprio + * class won't be changed any more. + */ + bfqd->oom_bfqq.entity.prio_changed = 1; + + bfqd->queue = q; + + spin_lock_irq(q->queue_lock); + q->elevator = eq; + spin_unlock_irq(q->queue_lock); + + bfqd->root_group = bfq_create_group_hierarchy(bfqd, q->node); + if (!bfqd->root_group) + goto out_free; + bfq_init_root_group(bfqd->root_group, bfqd); + bfq_init_entity(&bfqd->oom_bfqq.entity, bfqd->root_group); + + hrtimer_init(&bfqd->idle_slice_timer, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); + bfqd->idle_slice_timer.function = bfq_idle_slice_timer; + + bfqd->queue_weights_tree = RB_ROOT; + bfqd->group_weights_tree = RB_ROOT; + + INIT_WORK(&bfqd->unplug_work, bfq_kick_queue); + + INIT_LIST_HEAD(&bfqd->active_list); + INIT_LIST_HEAD(&bfqd->idle_list); + INIT_HLIST_HEAD(&bfqd->burst_list); + + bfqd->hw_tag = -1; + + bfqd->bfq_max_budget = bfq_default_max_budget; + + bfqd->bfq_fifo_expire[0] = bfq_fifo_expire[0]; + bfqd->bfq_fifo_expire[1] = bfq_fifo_expire[1]; + bfqd->bfq_back_max = bfq_back_max; + bfqd->bfq_back_penalty = bfq_back_penalty; + bfqd->bfq_slice_idle = bfq_slice_idle; + bfqd->bfq_timeout = bfq_timeout; + + bfqd->bfq_requests_within_timer = 120; + + bfqd->bfq_large_burst_thresh = 8; + bfqd->bfq_burst_interval = msecs_to_jiffies(180); + + bfqd->low_latency = true; + + /* + * Trade-off between responsiveness and fairness. + */ + bfqd->bfq_wr_coeff = 30; + bfqd->bfq_wr_rt_max_time = msecs_to_jiffies(300); + bfqd->bfq_wr_max_time = 0; + bfqd->bfq_wr_min_idle_time = msecs_to_jiffies(2000); + bfqd->bfq_wr_min_inter_arr_async = msecs_to_jiffies(500); + bfqd->bfq_wr_max_softrt_rate = 7000; /* + * Approximate rate required + * to playback or record a + * high-definition compressed + * video. + */ + bfqd->wr_busy_queues = 0; + + /* + * Begin by assuming, optimistically, that the device is a + * high-speed one, and that its peak rate is equal to 2/3 of + * the highest reference rate. + */ + bfqd->RT_prod = R_fast[blk_queue_nonrot(bfqd->queue)] * + T_fast[blk_queue_nonrot(bfqd->queue)]; + bfqd->peak_rate = R_fast[blk_queue_nonrot(bfqd->queue)] * 2 / 3; + bfqd->device_speed = BFQ_BFQD_FAST; + + return 0; + +out_free: + kfree(bfqd); + kobject_put(&eq->kobj); + return -ENOMEM; +} + +static void bfq_slab_kill(void) +{ + kmem_cache_destroy(bfq_pool); +} + +static int __init bfq_slab_setup(void) +{ + bfq_pool = KMEM_CACHE(bfq_queue, 0); + if (!bfq_pool) + return -ENOMEM; + return 0; +} + +static ssize_t bfq_var_show(unsigned int var, char *page) +{ + return sprintf(page, "%u\n", var); +} + +static ssize_t bfq_var_store(unsigned long *var, const char *page, + size_t count) +{ + unsigned long new_val; + int ret = kstrtoul(page, 10, &new_val); + + if (ret == 0) + *var = new_val; + + return count; +} + +static ssize_t bfq_wr_max_time_show(struct elevator_queue *e, char *page) +{ + struct bfq_data *bfqd = e->elevator_data; + + return sprintf(page, "%d\n", bfqd->bfq_wr_max_time > 0 ? + jiffies_to_msecs(bfqd->bfq_wr_max_time) : + jiffies_to_msecs(bfq_wr_duration(bfqd))); +} + +static ssize_t bfq_weights_show(struct elevator_queue *e, char *page) +{ + struct bfq_queue *bfqq; + struct bfq_data *bfqd = e->elevator_data; + ssize_t num_char = 0; + + num_char += sprintf(page + num_char, "Tot reqs queued %d\n\n", + bfqd->queued); + + spin_lock_irq(bfqd->queue->queue_lock); + + num_char += sprintf(page + num_char, "Active:\n"); + list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) { + num_char += sprintf(page + num_char, + "pid%d: weight %hu, nr_queued %d %d, ", + bfqq->pid, + bfqq->entity.weight, + bfqq->queued[0], + bfqq->queued[1]); + num_char += sprintf(page + num_char, + "dur %d/%u\n", + jiffies_to_msecs( + jiffies - + bfqq->last_wr_start_finish), + jiffies_to_msecs(bfqq->wr_cur_max_time)); + } + + num_char += sprintf(page + num_char, "Idle:\n"); + list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list) { + num_char += sprintf(page + num_char, + "pid%d: weight %hu, dur %d/%u\n", + bfqq->pid, + bfqq->entity.weight, + jiffies_to_msecs(jiffies - + bfqq->last_wr_start_finish), + jiffies_to_msecs(bfqq->wr_cur_max_time)); + } + + spin_unlock_irq(bfqd->queue->queue_lock); + + return num_char; +} + +#define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \ +static ssize_t __FUNC(struct elevator_queue *e, char *page) \ +{ \ + struct bfq_data *bfqd = e->elevator_data; \ + u64 __data = __VAR; \ + if (__CONV == 1) \ + __data = jiffies_to_msecs(__data); \ + else if (__CONV == 2) \ + __data = div_u64(__data, NSEC_PER_MSEC); \ + return bfq_var_show(__data, (page)); \ +} +SHOW_FUNCTION(bfq_fifo_expire_sync_show, bfqd->bfq_fifo_expire[1], 2); +SHOW_FUNCTION(bfq_fifo_expire_async_show, bfqd->bfq_fifo_expire[0], 2); +SHOW_FUNCTION(bfq_back_seek_max_show, bfqd->bfq_back_max, 0); +SHOW_FUNCTION(bfq_back_seek_penalty_show, bfqd->bfq_back_penalty, 0); +SHOW_FUNCTION(bfq_slice_idle_show, bfqd->bfq_slice_idle, 2); +SHOW_FUNCTION(bfq_max_budget_show, bfqd->bfq_user_max_budget, 0); +SHOW_FUNCTION(bfq_timeout_sync_show, bfqd->bfq_timeout, 1); +SHOW_FUNCTION(bfq_strict_guarantees_show, bfqd->strict_guarantees, 0); +SHOW_FUNCTION(bfq_low_latency_show, bfqd->low_latency, 0); +SHOW_FUNCTION(bfq_wr_coeff_show, bfqd->bfq_wr_coeff, 0); +SHOW_FUNCTION(bfq_wr_rt_max_time_show, bfqd->bfq_wr_rt_max_time, 1); +SHOW_FUNCTION(bfq_wr_min_idle_time_show, bfqd->bfq_wr_min_idle_time, 1); +SHOW_FUNCTION(bfq_wr_min_inter_arr_async_show, bfqd->bfq_wr_min_inter_arr_async, + 1); +SHOW_FUNCTION(bfq_wr_max_softrt_rate_show, bfqd->bfq_wr_max_softrt_rate, 0); +#undef SHOW_FUNCTION + +#define USEC_SHOW_FUNCTION(__FUNC, __VAR) \ +static ssize_t __FUNC(struct elevator_queue *e, char *page) \ +{ \ + struct bfq_data *bfqd = e->elevator_data; \ + u64 __data = __VAR; \ + __data = div_u64(__data, NSEC_PER_USEC); \ + return bfq_var_show(__data, (page)); \ +} +USEC_SHOW_FUNCTION(bfq_slice_idle_us_show, bfqd->bfq_slice_idle); +#undef USEC_SHOW_FUNCTION + +#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \ +static ssize_t \ +__FUNC(struct elevator_queue *e, const char *page, size_t count) \ +{ \ + struct bfq_data *bfqd = e->elevator_data; \ + unsigned long uninitialized_var(__data); \ + int ret = bfq_var_store(&__data, (page), count); \ + if (__data < (MIN)) \ + __data = (MIN); \ + else if (__data > (MAX)) \ + __data = (MAX); \ + if (__CONV == 1) \ + *(__PTR) = msecs_to_jiffies(__data); \ + else if (__CONV == 2) \ + *(__PTR) = (u64)__data * NSEC_PER_MSEC; \ + else \ + *(__PTR) = __data; \ + return ret; \ +} +STORE_FUNCTION(bfq_fifo_expire_sync_store, &bfqd->bfq_fifo_expire[1], 1, + INT_MAX, 2); +STORE_FUNCTION(bfq_fifo_expire_async_store, &bfqd->bfq_fifo_expire[0], 1, + INT_MAX, 2); +STORE_FUNCTION(bfq_back_seek_max_store, &bfqd->bfq_back_max, 0, INT_MAX, 0); +STORE_FUNCTION(bfq_back_seek_penalty_store, &bfqd->bfq_back_penalty, 1, + INT_MAX, 0); +STORE_FUNCTION(bfq_slice_idle_store, &bfqd->bfq_slice_idle, 0, INT_MAX, 2); +STORE_FUNCTION(bfq_wr_coeff_store, &bfqd->bfq_wr_coeff, 1, INT_MAX, 0); +STORE_FUNCTION(bfq_wr_max_time_store, &bfqd->bfq_wr_max_time, 0, INT_MAX, 1); +STORE_FUNCTION(bfq_wr_rt_max_time_store, &bfqd->bfq_wr_rt_max_time, 0, INT_MAX, + 1); +STORE_FUNCTION(bfq_wr_min_idle_time_store, &bfqd->bfq_wr_min_idle_time, 0, + INT_MAX, 1); +STORE_FUNCTION(bfq_wr_min_inter_arr_async_store, + &bfqd->bfq_wr_min_inter_arr_async, 0, INT_MAX, 1); +STORE_FUNCTION(bfq_wr_max_softrt_rate_store, &bfqd->bfq_wr_max_softrt_rate, 0, + INT_MAX, 0); +#undef STORE_FUNCTION + +#define USEC_STORE_FUNCTION(__FUNC, __PTR, MIN, MAX) \ +static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count)\ +{ \ + struct bfq_data *bfqd = e->elevator_data; \ + unsigned long uninitialized_var(__data); \ + int ret = bfq_var_store(&__data, (page), count); \ + if (__data < (MIN)) \ + __data = (MIN); \ + else if (__data > (MAX)) \ + __data = (MAX); \ + *(__PTR) = (u64)__data * NSEC_PER_USEC; \ + return ret; \ +} +USEC_STORE_FUNCTION(bfq_slice_idle_us_store, &bfqd->bfq_slice_idle, 0, + UINT_MAX); +#undef USEC_STORE_FUNCTION + +/* do nothing for the moment */ +static ssize_t bfq_weights_store(struct elevator_queue *e, + const char *page, size_t count) +{ + return count; +} + +static ssize_t bfq_max_budget_store(struct elevator_queue *e, + const char *page, size_t count) +{ + struct bfq_data *bfqd = e->elevator_data; + unsigned long uninitialized_var(__data); + int ret = bfq_var_store(&__data, (page), count); + + if (__data == 0) + bfqd->bfq_max_budget = bfq_calc_max_budget(bfqd); + else { + if (__data > INT_MAX) + __data = INT_MAX; + bfqd->bfq_max_budget = __data; + } + + bfqd->bfq_user_max_budget = __data; + + return ret; +} + +/* + * Leaving this name to preserve name compatibility with cfq + * parameters, but this timeout is used for both sync and async. + */ +static ssize_t bfq_timeout_sync_store(struct elevator_queue *e, + const char *page, size_t count) +{ + struct bfq_data *bfqd = e->elevator_data; + unsigned long uninitialized_var(__data); + int ret = bfq_var_store(&__data, (page), count); + + if (__data < 1) + __data = 1; + else if (__data > INT_MAX) + __data = INT_MAX; + + bfqd->bfq_timeout = msecs_to_jiffies(__data); + if (bfqd->bfq_user_max_budget == 0) + bfqd->bfq_max_budget = bfq_calc_max_budget(bfqd); + + return ret; +} + +static ssize_t bfq_strict_guarantees_store(struct elevator_queue *e, + const char *page, size_t count) +{ + struct bfq_data *bfqd = e->elevator_data; + unsigned long uninitialized_var(__data); + int ret = bfq_var_store(&__data, (page), count); + + if (__data > 1) + __data = 1; + if (!bfqd->strict_guarantees && __data == 1 + && bfqd->bfq_slice_idle < 8 * NSEC_PER_MSEC) + bfqd->bfq_slice_idle = 8 * NSEC_PER_MSEC; + + bfqd->strict_guarantees = __data; + + return ret; +} + +static ssize_t bfq_low_latency_store(struct elevator_queue *e, + const char *page, size_t count) +{ + struct bfq_data *bfqd = e->elevator_data; + unsigned long uninitialized_var(__data); + int ret = bfq_var_store(&__data, (page), count); + + if (__data > 1) + __data = 1; + if (__data == 0 && bfqd->low_latency != 0) + bfq_end_wr(bfqd); + bfqd->low_latency = __data; + + return ret; +} + +#define BFQ_ATTR(name) \ + __ATTR(name, S_IRUGO|S_IWUSR, bfq_##name##_show, bfq_##name##_store) + +static struct elv_fs_entry bfq_attrs[] = { + BFQ_ATTR(fifo_expire_sync), + BFQ_ATTR(fifo_expire_async), + BFQ_ATTR(back_seek_max), + BFQ_ATTR(back_seek_penalty), + BFQ_ATTR(slice_idle), + BFQ_ATTR(slice_idle_us), + BFQ_ATTR(max_budget), + BFQ_ATTR(timeout_sync), + BFQ_ATTR(strict_guarantees), + BFQ_ATTR(low_latency), + BFQ_ATTR(wr_coeff), + BFQ_ATTR(wr_max_time), + BFQ_ATTR(wr_rt_max_time), + BFQ_ATTR(wr_min_idle_time), + BFQ_ATTR(wr_min_inter_arr_async), + BFQ_ATTR(wr_max_softrt_rate), + BFQ_ATTR(weights), + __ATTR_NULL +}; + +static struct elevator_type iosched_bfq = { + .ops.sq = { + .elevator_merge_fn = bfq_merge, + .elevator_merged_fn = bfq_merged_request, + .elevator_merge_req_fn = bfq_merged_requests, +#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED + .elevator_bio_merged_fn = bfq_bio_merged, +#endif + .elevator_allow_bio_merge_fn = bfq_allow_bio_merge, + .elevator_allow_rq_merge_fn = bfq_allow_rq_merge, + .elevator_dispatch_fn = bfq_dispatch_requests, + .elevator_add_req_fn = bfq_insert_request, + .elevator_activate_req_fn = bfq_activate_request, + .elevator_deactivate_req_fn = bfq_deactivate_request, + .elevator_completed_req_fn = bfq_completed_request, + .elevator_former_req_fn = elv_rb_former_request, + .elevator_latter_req_fn = elv_rb_latter_request, + .elevator_init_icq_fn = bfq_init_icq, + .elevator_exit_icq_fn = bfq_exit_icq, + .elevator_set_req_fn = bfq_set_request, + .elevator_put_req_fn = bfq_put_request, + .elevator_may_queue_fn = bfq_may_queue, + .elevator_init_fn = bfq_init_queue, + .elevator_exit_fn = bfq_exit_queue, + }, + .icq_size = sizeof(struct bfq_io_cq), + .icq_align = __alignof__(struct bfq_io_cq), + .elevator_attrs = bfq_attrs, + .elevator_name = "bfq-sq", + .elevator_owner = THIS_MODULE, +}; + +#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED +static struct blkcg_policy blkcg_policy_bfq = { + .dfl_cftypes = bfq_blkg_files, + .legacy_cftypes = bfq_blkcg_legacy_files, + + .cpd_alloc_fn = bfq_cpd_alloc, + .cpd_init_fn = bfq_cpd_init, + .cpd_bind_fn = bfq_cpd_init, + .cpd_free_fn = bfq_cpd_free, + + .pd_alloc_fn = bfq_pd_alloc, + .pd_init_fn = bfq_pd_init, + .pd_offline_fn = bfq_pd_offline, + .pd_free_fn = bfq_pd_free, + .pd_reset_stats_fn = bfq_pd_reset_stats, +}; +#endif + +static int __init bfq_init(void) +{ + int ret; + char msg[60] = "BFQ I/O-scheduler: v8r12"; + +#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED + ret = blkcg_policy_register(&blkcg_policy_bfq); + if (ret) + return ret; +#endif + + ret = -ENOMEM; + if (bfq_slab_setup()) + goto err_pol_unreg; + + /* + * Times to load large popular applications for the typical + * systems installed on the reference devices (see the + * comments before the definitions of the next two + * arrays). Actually, we use slightly slower values, as the + * estimated peak rate tends to be smaller than the actual + * peak rate. The reason for this last fact is that estimates + * are computed over much shorter time intervals than the long + * intervals typically used for benchmarking. Why? First, to + * adapt more quickly to variations. Second, because an I/O + * scheduler cannot rely on a peak-rate-evaluation workload to + * be run for a long time. + */ + T_slow[0] = msecs_to_jiffies(3500); /* actually 4 sec */ + T_slow[1] = msecs_to_jiffies(6000); /* actually 6.5 sec */ + T_fast[0] = msecs_to_jiffies(7000); /* actually 8 sec */ + T_fast[1] = msecs_to_jiffies(2500); /* actually 3 sec */ + + /* + * Thresholds that determine the switch between speed classes + * (see the comments before the definition of the array + * device_speed_thresh). These thresholds are biased towards + * transitions to the fast class. This is safer than the + * opposite bias. In fact, a wrong transition to the slow + * class results in short weight-raising periods, because the + * speed of the device then tends to be higher that the + * reference peak rate. On the opposite end, a wrong + * transition to the fast class tends to increase + * weight-raising periods, because of the opposite reason. + */ + device_speed_thresh[0] = (4 * R_slow[0]) / 3; + device_speed_thresh[1] = (4 * R_slow[1]) / 3; + + ret = elv_register(&iosched_bfq); + if (ret) + goto err_pol_unreg; + +#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED + strcat(msg, " (with cgroups support)"); +#endif + pr_info("%s", msg); + + return 0; + +err_pol_unreg: +#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED + blkcg_policy_unregister(&blkcg_policy_bfq); +#endif + return ret; +} + +static void __exit bfq_exit(void) +{ + elv_unregister(&iosched_bfq); +#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED + blkcg_policy_unregister(&blkcg_policy_bfq); +#endif + bfq_slab_kill(); +} + +module_init(bfq_init); +module_exit(bfq_exit); + +MODULE_AUTHOR("Arianna Avanzini, Fabio Checconi, Paolo Valente"); +MODULE_LICENSE("GPL"); diff --git a/block/bfq.h b/block/bfq.h new file mode 100644 index 000000000000..f5751ea59d98 --- /dev/null +++ b/block/bfq.h @@ -0,0 +1,948 @@ +/* + * BFQ v8r12 for 4.11.0: data structures and common functions prototypes. + * + * Based on ideas and code from CFQ: + * Copyright (C) 2003 Jens Axboe + * + * Copyright (C) 2008 Fabio Checconi + * Paolo Valente + * + * Copyright (C) 2015 Paolo Valente + * + * Copyright (C) 2017 Paolo Valente + */ + +#ifndef _BFQ_H +#define _BFQ_H + +#include +#include +#include + +#define BFQ_IOPRIO_CLASSES 3 +#define BFQ_CL_IDLE_TIMEOUT (HZ/5) + +#define BFQ_MIN_WEIGHT 1 +#define BFQ_MAX_WEIGHT 1000 +#define BFQ_WEIGHT_CONVERSION_COEFF 10 + +#define BFQ_DEFAULT_QUEUE_IOPRIO 4 + +#define BFQ_WEIGHT_LEGACY_DFL 100 +#define BFQ_DEFAULT_GRP_IOPRIO 0 +#define BFQ_DEFAULT_GRP_CLASS IOPRIO_CLASS_BE + +/* + * Soft real-time applications are extremely more latency sensitive + * than interactive ones. Over-raise the weight of the former to + * privilege them against the latter. + */ +#define BFQ_SOFTRT_WEIGHT_FACTOR 100 + +struct bfq_entity; + +/** + * struct bfq_service_tree - per ioprio_class service tree. + * + * Each service tree represents a B-WF2Q+ scheduler on its own. Each + * ioprio_class has its own independent scheduler, and so its own + * bfq_service_tree. All the fields are protected by the queue lock + * of the containing bfqd. + */ +struct bfq_service_tree { + /* tree for active entities (i.e., those backlogged) */ + struct rb_root active; + /* tree for idle entities (i.e., not backlogged, with V <= F_i)*/ + struct rb_root idle; + + struct bfq_entity *first_idle; /* idle entity with minimum F_i */ + struct bfq_entity *last_idle; /* idle entity with maximum F_i */ + + u64 vtime; /* scheduler virtual time */ + /* scheduler weight sum; active and idle entities contribute to it */ + unsigned long wsum; +}; + +/** + * struct bfq_sched_data - multi-class scheduler. + * + * bfq_sched_data is the basic scheduler queue. It supports three + * ioprio_classes, and can be used either as a toplevel queue or as an + * intermediate queue on a hierarchical setup. @next_in_service + * points to the active entity of the sched_data service trees that + * will be scheduled next. It is used to reduce the number of steps + * needed for each hierarchical-schedule update. + * + * The supported ioprio_classes are the same as in CFQ, in descending + * priority order, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE. + * Requests from higher priority queues are served before all the + * requests from lower priority queues; among requests of the same + * queue requests are served according to B-WF2Q+. + * All the fields are protected by the queue lock of the containing bfqd. + */ +struct bfq_sched_data { + struct bfq_entity *in_service_entity; /* entity in service */ + /* head-of-the-line entity in the scheduler (see comments above) */ + struct bfq_entity *next_in_service; + /* array of service trees, one per ioprio_class */ + struct bfq_service_tree service_tree[BFQ_IOPRIO_CLASSES]; + /* last time CLASS_IDLE was served */ + unsigned long bfq_class_idle_last_service; + +}; + +/** + * struct bfq_weight_counter - counter of the number of all active entities + * with a given weight. + */ +struct bfq_weight_counter { + unsigned int weight; /* weight of the entities this counter refers to */ + unsigned int num_active; /* nr of active entities with this weight */ + /* + * Weights tree member (see bfq_data's @queue_weights_tree and + * @group_weights_tree) + */ + struct rb_node weights_node; +}; + +/** + * struct bfq_entity - schedulable entity. + * + * A bfq_entity is used to represent either a bfq_queue (leaf node in the + * cgroup hierarchy) or a bfq_group into the upper level scheduler. Each + * entity belongs to the sched_data of the parent group in the cgroup + * hierarchy. Non-leaf entities have also their own sched_data, stored + * in @my_sched_data. + * + * Each entity stores independently its priority values; this would + * allow different weights on different devices, but this + * functionality is not exported to userspace by now. Priorities and + * weights are updated lazily, first storing the new values into the + * new_* fields, then setting the @prio_changed flag. As soon as + * there is a transition in the entity state that allows the priority + * update to take place the effective and the requested priority + * values are synchronized. + * + * Unless cgroups are used, the weight value is calculated from the + * ioprio to export the same interface as CFQ. When dealing with + * ``well-behaved'' queues (i.e., queues that do not spend too much + * time to consume their budget and have true sequential behavior, and + * when there are no external factors breaking anticipation) the + * relative weights at each level of the cgroups hierarchy should be + * guaranteed. All the fields are protected by the queue lock of the + * containing bfqd. + */ +struct bfq_entity { + struct rb_node rb_node; /* service_tree member */ + /* pointer to the weight counter associated with this entity */ + struct bfq_weight_counter *weight_counter; + + /* + * Flag, true if the entity is on a tree (either the active or + * the idle one of its service_tree) or is in service. + */ + bool on_st; + + u64 finish; /* B-WF2Q+ finish timestamp (aka F_i) */ + u64 start; /* B-WF2Q+ start timestamp (aka S_i) */ + + /* tree the entity is enqueued into; %NULL if not on a tree */ + struct rb_root *tree; + + /* + * minimum start time of the (active) subtree rooted at this + * entity; used for O(log N) lookups into active trees + */ + u64 min_start; + + /* amount of service received during the last service slot */ + int service; + + /* budget, used also to calculate F_i: F_i = S_i + @budget / @weight */ + int budget; + + unsigned int weight; /* weight of the queue */ + unsigned int new_weight; /* next weight if a change is in progress */ + + /* original weight, used to implement weight boosting */ + unsigned int orig_weight; + + /* parent entity, for hierarchical scheduling */ + struct bfq_entity *parent; + + /* + * For non-leaf nodes in the hierarchy, the associated + * scheduler queue, %NULL on leaf nodes. + */ + struct bfq_sched_data *my_sched_data; + /* the scheduler queue this entity belongs to */ + struct bfq_sched_data *sched_data; + + /* flag, set to request a weight, ioprio or ioprio_class change */ + int prio_changed; +}; + +struct bfq_group; + +/** + * struct bfq_queue - leaf schedulable entity. + * + * A bfq_queue is a leaf request queue; it can be associated with an + * io_context or more, if it is async or shared between cooperating + * processes. @cgroup holds a reference to the cgroup, to be sure that it + * does not disappear while a bfqq still references it (mostly to avoid + * races between request issuing and task migration followed by cgroup + * destruction). + * All the fields are protected by the queue lock of the containing bfqd. + */ +struct bfq_queue { + /* reference counter */ + int ref; + /* parent bfq_data */ + struct bfq_data *bfqd; + + /* current ioprio and ioprio class */ + unsigned short ioprio, ioprio_class; + /* next ioprio and ioprio class if a change is in progress */ + unsigned short new_ioprio, new_ioprio_class; + + /* + * Shared bfq_queue if queue is cooperating with one or more + * other queues. + */ + struct bfq_queue *new_bfqq; + /* request-position tree member (see bfq_group's @rq_pos_tree) */ + struct rb_node pos_node; + /* request-position tree root (see bfq_group's @rq_pos_tree) */ + struct rb_root *pos_root; + + /* sorted list of pending requests */ + struct rb_root sort_list; + /* if fifo isn't expired, next request to serve */ + struct request *next_rq; + /* number of sync and async requests queued */ + int queued[2]; + /* number of sync and async requests currently allocated */ + int allocated[2]; + /* number of pending metadata requests */ + int meta_pending; + /* fifo list of requests in sort_list */ + struct list_head fifo; + + /* entity representing this queue in the scheduler */ + struct bfq_entity entity; + + /* maximum budget allowed from the feedback mechanism */ + int max_budget; + /* budget expiration (in jiffies) */ + unsigned long budget_timeout; + + /* number of requests on the dispatch list or inside driver */ + int dispatched; + + unsigned int flags; /* status flags.*/ + + /* node for active/idle bfqq list inside parent bfqd */ + struct list_head bfqq_list; + + /* bit vector: a 1 for each seeky requests in history */ + u32 seek_history; + + /* node for the device's burst list */ + struct hlist_node burst_list_node; + + /* position of the last request enqueued */ + sector_t last_request_pos; + + /* Number of consecutive pairs of request completion and + * arrival, such that the queue becomes idle after the + * completion, but the next request arrives within an idle + * time slice; used only if the queue's IO_bound flag has been + * cleared. + */ + unsigned int requests_within_timer; + + /* pid of the process owning the queue, used for logging purposes */ + pid_t pid; + + /* + * Pointer to the bfq_io_cq owning the bfq_queue, set to %NULL + * if the queue is shared. + */ + struct bfq_io_cq *bic; + + /* current maximum weight-raising time for this queue */ + unsigned long wr_cur_max_time; + /* + * Minimum time instant such that, only if a new request is + * enqueued after this time instant in an idle @bfq_queue with + * no outstanding requests, then the task associated with the + * queue it is deemed as soft real-time (see the comments on + * the function bfq_bfqq_softrt_next_start()) + */ + unsigned long soft_rt_next_start; + /* + * Start time of the current weight-raising period if + * the @bfq-queue is being weight-raised, otherwise + * finish time of the last weight-raising period. + */ + unsigned long last_wr_start_finish; + /* factor by which the weight of this queue is multiplied */ + unsigned int wr_coeff; + /* + * Time of the last transition of the @bfq_queue from idle to + * backlogged. + */ + unsigned long last_idle_bklogged; + /* + * Cumulative service received from the @bfq_queue since the + * last transition from idle to backlogged. + */ + unsigned long service_from_backlogged; + /* + * Value of wr start time when switching to soft rt + */ + unsigned long wr_start_at_switch_to_srt; + + unsigned long split_time; /* time of last split */ +}; + +/** + * struct bfq_ttime - per process thinktime stats. + */ +struct bfq_ttime { + u64 last_end_request; /* completion time of last request */ + + u64 ttime_total; /* total process thinktime */ + unsigned long ttime_samples; /* number of thinktime samples */ + u64 ttime_mean; /* average process thinktime */ + +}; + +/** + * struct bfq_io_cq - per (request_queue, io_context) structure. + */ +struct bfq_io_cq { + /* associated io_cq structure */ + struct io_cq icq; /* must be the first member */ + /* array of two process queues, the sync and the async */ + struct bfq_queue *bfqq[2]; + /* associated @bfq_ttime struct */ + struct bfq_ttime ttime; + /* per (request_queue, blkcg) ioprio */ + int ioprio; +#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED + uint64_t blkcg_serial_nr; /* the current blkcg serial */ +#endif + + /* + * Snapshot of the idle window before merging; taken to + * remember this value while the queue is merged, so as to be + * able to restore it in case of split. + */ + bool saved_idle_window; + /* + * Same purpose as the previous two fields for the I/O bound + * classification of a queue. + */ + bool saved_IO_bound; + + /* + * Same purpose as the previous fields for the value of the + * field keeping the queue's belonging to a large burst + */ + bool saved_in_large_burst; + /* + * True if the queue belonged to a burst list before its merge + * with another cooperating queue. + */ + bool was_in_burst_list; + + /* + * Similar to previous fields: save wr information. + */ + unsigned long saved_wr_coeff; + unsigned long saved_last_wr_start_finish; + unsigned long saved_wr_start_at_switch_to_srt; + unsigned int saved_wr_cur_max_time; +}; + +enum bfq_device_speed { + BFQ_BFQD_FAST, + BFQ_BFQD_SLOW, +}; + +/** + * struct bfq_data - per-device data structure. + * + * All the fields are protected by the @queue lock. + */ +struct bfq_data { + /* request queue for the device */ + struct request_queue *queue; + + /* root bfq_group for the device */ + struct bfq_group *root_group; + + /* + * rbtree of weight counters of @bfq_queues, sorted by + * weight. Used to keep track of whether all @bfq_queues have + * the same weight. The tree contains one counter for each + * distinct weight associated to some active and not + * weight-raised @bfq_queue (see the comments to the functions + * bfq_weights_tree_[add|remove] for further details). + */ + struct rb_root queue_weights_tree; + /* + * rbtree of non-queue @bfq_entity weight counters, sorted by + * weight. Used to keep track of whether all @bfq_groups have + * the same weight. The tree contains one counter for each + * distinct weight associated to some active @bfq_group (see + * the comments to the functions bfq_weights_tree_[add|remove] + * for further details). + */ + struct rb_root group_weights_tree; + + /* + * Number of bfq_queues containing requests (including the + * queue in service, even if it is idling). + */ + int busy_queues; + /* number of weight-raised busy @bfq_queues */ + int wr_busy_queues; + /* number of queued requests */ + int queued; + /* number of requests dispatched and waiting for completion */ + int rq_in_driver; + + /* + * Maximum number of requests in driver in the last + * @hw_tag_samples completed requests. + */ + int max_rq_in_driver; + /* number of samples used to calculate hw_tag */ + int hw_tag_samples; + /* flag set to one if the driver is showing a queueing behavior */ + int hw_tag; + + /* number of budgets assigned */ + int budgets_assigned; + + /* + * Timer set when idling (waiting) for the next request from + * the queue in service. + */ + struct hrtimer idle_slice_timer; + /* delayed work to restart dispatching on the request queue */ + struct work_struct unplug_work; + + /* bfq_queue in service */ + struct bfq_queue *in_service_queue; + /* bfq_io_cq (bic) associated with the @in_service_queue */ + struct bfq_io_cq *in_service_bic; + + /* on-disk position of the last served request */ + sector_t last_position; + + /* time of last request completion (ns) */ + u64 last_completion; + + /* time of first rq dispatch in current observation interval (ns) */ + u64 first_dispatch; + /* time of last rq dispatch in current observation interval (ns) */ + u64 last_dispatch; + + /* beginning of the last budget */ + ktime_t last_budget_start; + /* beginning of the last idle slice */ + ktime_t last_idling_start; + + /* number of samples in current observation interval */ + int peak_rate_samples; + /* num of samples of seq dispatches in current observation interval */ + u32 sequential_samples; + /* total num of sectors transferred in current observation interval */ + u64 tot_sectors_dispatched; + /* max rq size seen during current observation interval (sectors) */ + u32 last_rq_max_size; + /* time elapsed from first dispatch in current observ. interval (us) */ + u64 delta_from_first; + /* current estimate of device peak rate */ + u32 peak_rate; + + /* maximum budget allotted to a bfq_queue before rescheduling */ + int bfq_max_budget; + + /* list of all the bfq_queues active on the device */ + struct list_head active_list; + /* list of all the bfq_queues idle on the device */ + struct list_head idle_list; + + /* + * Timeout for async/sync requests; when it fires, requests + * are served in fifo order. + */ + u64 bfq_fifo_expire[2]; + /* weight of backward seeks wrt forward ones */ + unsigned int bfq_back_penalty; + /* maximum allowed backward seek */ + unsigned int bfq_back_max; + /* maximum idling time */ + u32 bfq_slice_idle; + + /* user-configured max budget value (0 for auto-tuning) */ + int bfq_user_max_budget; + /* + * Timeout for bfq_queues to consume their budget; used to + * prevent seeky queues from imposing long latencies to + * sequential or quasi-sequential ones (this also implies that + * seeky queues cannot receive guarantees in the service + * domain; after a timeout they are charged for the time they + * have been in service, to preserve fairness among them, but + * without service-domain guarantees). + */ + unsigned int bfq_timeout; + + /* + * Number of consecutive requests that must be issued within + * the idle time slice to set again idling to a queue which + * was marked as non-I/O-bound (see the definition of the + * IO_bound flag for further details). + */ + unsigned int bfq_requests_within_timer; + + /* + * Force device idling whenever needed to provide accurate + * service guarantees, without caring about throughput + * issues. CAVEAT: this may even increase latencies, in case + * of useless idling for processes that did stop doing I/O. + */ + bool strict_guarantees; + + /* + * Last time at which a queue entered the current burst of + * queues being activated shortly after each other; for more + * details about this and the following parameters related to + * a burst of activations, see the comments on the function + * bfq_handle_burst. + */ + unsigned long last_ins_in_burst; + /* + * Reference time interval used to decide whether a queue has + * been activated shortly after @last_ins_in_burst. + */ + unsigned long bfq_burst_interval; + /* number of queues in the current burst of queue activations */ + int burst_size; + + /* common parent entity for the queues in the burst */ + struct bfq_entity *burst_parent_entity; + /* Maximum burst size above which the current queue-activation + * burst is deemed as 'large'. + */ + unsigned long bfq_large_burst_thresh; + /* true if a large queue-activation burst is in progress */ + bool large_burst; + /* + * Head of the burst list (as for the above fields, more + * details in the comments on the function bfq_handle_burst). + */ + struct hlist_head burst_list; + + /* if set to true, low-latency heuristics are enabled */ + bool low_latency; + /* + * Maximum factor by which the weight of a weight-raised queue + * is multiplied. + */ + unsigned int bfq_wr_coeff; + /* maximum duration of a weight-raising period (jiffies) */ + unsigned int bfq_wr_max_time; + + /* Maximum weight-raising duration for soft real-time processes */ + unsigned int bfq_wr_rt_max_time; + /* + * Minimum idle period after which weight-raising may be + * reactivated for a queue (in jiffies). + */ + unsigned int bfq_wr_min_idle_time; + /* + * Minimum period between request arrivals after which + * weight-raising may be reactivated for an already busy async + * queue (in jiffies). + */ + unsigned long bfq_wr_min_inter_arr_async; + + /* Max service-rate for a soft real-time queue, in sectors/sec */ + unsigned int bfq_wr_max_softrt_rate; + /* + * Cached value of the product R*T, used for computing the + * maximum duration of weight raising automatically. + */ + u64 RT_prod; + /* device-speed class for the low-latency heuristic */ + enum bfq_device_speed device_speed; + + /* fallback dummy bfqq for extreme OOM conditions */ + struct bfq_queue oom_bfqq; +}; + +enum bfqq_state_flags { + BFQ_BFQQ_FLAG_just_created = 0, /* queue just allocated */ + BFQ_BFQQ_FLAG_busy, /* has requests or is in service */ + BFQ_BFQQ_FLAG_wait_request, /* waiting for a request */ + BFQ_BFQQ_FLAG_non_blocking_wait_rq, /* + * waiting for a request + * without idling the device + */ + BFQ_BFQQ_FLAG_must_alloc, /* must be allowed rq alloc */ + BFQ_BFQQ_FLAG_fifo_expire, /* FIFO checked in this slice */ + BFQ_BFQQ_FLAG_idle_window, /* slice idling enabled */ + BFQ_BFQQ_FLAG_sync, /* synchronous queue */ + BFQ_BFQQ_FLAG_IO_bound, /* + * bfqq has timed-out at least once + * having consumed at most 2/10 of + * its budget + */ + BFQ_BFQQ_FLAG_in_large_burst, /* + * bfqq activated in a large burst, + * see comments to bfq_handle_burst. + */ + BFQ_BFQQ_FLAG_softrt_update, /* + * may need softrt-next-start + * update + */ + BFQ_BFQQ_FLAG_coop, /* bfqq is shared */ + BFQ_BFQQ_FLAG_split_coop /* shared bfqq will be split */ +}; + +#define BFQ_BFQQ_FNS(name) \ +static void bfq_mark_bfqq_##name(struct bfq_queue *bfqq) \ +{ \ + (bfqq)->flags |= (1 << BFQ_BFQQ_FLAG_##name); \ +} \ +static void bfq_clear_bfqq_##name(struct bfq_queue *bfqq) \ +{ \ + (bfqq)->flags &= ~(1 << BFQ_BFQQ_FLAG_##name); \ +} \ +static int bfq_bfqq_##name(const struct bfq_queue *bfqq) \ +{ \ + return ((bfqq)->flags & (1 << BFQ_BFQQ_FLAG_##name)) != 0; \ +} + +BFQ_BFQQ_FNS(just_created); +BFQ_BFQQ_FNS(busy); +BFQ_BFQQ_FNS(wait_request); +BFQ_BFQQ_FNS(non_blocking_wait_rq); +BFQ_BFQQ_FNS(must_alloc); +BFQ_BFQQ_FNS(fifo_expire); +BFQ_BFQQ_FNS(idle_window); +BFQ_BFQQ_FNS(sync); +BFQ_BFQQ_FNS(IO_bound); +BFQ_BFQQ_FNS(in_large_burst); +BFQ_BFQQ_FNS(coop); +BFQ_BFQQ_FNS(split_coop); +BFQ_BFQQ_FNS(softrt_update); +#undef BFQ_BFQQ_FNS + +/* Logging facilities. */ +#ifdef CONFIG_BFQ_REDIRECT_TO_CONSOLE + +static const char *checked_dev_name(const struct device *dev) +{ + static const char nodev[] = "nodev"; + + if (dev) + return dev_name(dev); + + return nodev; +} + +#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED +static struct bfq_group *bfqq_group(struct bfq_queue *bfqq); +static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); + +#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \ + char __pbuf[128]; \ + \ + assert_spin_locked((bfqd)->queue->queue_lock); \ + blkg_path(bfqg_to_blkg(bfqq_group(bfqq)), __pbuf, sizeof(__pbuf)); \ + pr_crit("%s bfq%d%c %s " fmt "\n", \ + checked_dev_name((bfqd)->queue->backing_dev_info->dev), \ + (bfqq)->pid, \ + bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ + __pbuf, ##args); \ +} while (0) + +#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do { \ + char __pbuf[128]; \ + \ + blkg_path(bfqg_to_blkg(bfqg), __pbuf, sizeof(__pbuf)); \ + pr_crit("%s %s " fmt "\n", \ + checked_dev_name((bfqd)->queue->backing_dev_info->dev), \ + __pbuf, ##args); \ +} while (0) + +#else /* CONFIG_BFQ_SQ_GROUP_IOSCHED */ + +#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \ + pr_crit("%s bfq%d%c " fmt "\n", \ + checked_dev_name((bfqd)->queue->backing_dev_info->dev), \ + (bfqq)->pid, bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ + ##args) +#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do {} while (0) + +#endif /* CONFIG_BFQ_SQ_GROUP_IOSCHED */ + +#define bfq_log(bfqd, fmt, args...) \ + pr_crit("%s bfq " fmt "\n", \ + checked_dev_name((bfqd)->queue->backing_dev_info->dev), \ + ##args) + +#else /* CONFIG_BFQ_REDIRECT_TO_CONSOLE */ +#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED +static struct bfq_group *bfqq_group(struct bfq_queue *bfqq); +static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); + +#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \ + char __pbuf[128]; \ + \ + assert_spin_locked((bfqd)->queue->queue_lock); \ + blkg_path(bfqg_to_blkg(bfqq_group(bfqq)), __pbuf, sizeof(__pbuf)); \ + blk_add_trace_msg((bfqd)->queue, "bfq%d%c %s " fmt, \ + (bfqq)->pid, \ + bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ + __pbuf, ##args); \ +} while (0) + +#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do { \ + char __pbuf[128]; \ + \ + blkg_path(bfqg_to_blkg(bfqg), __pbuf, sizeof(__pbuf)); \ + blk_add_trace_msg((bfqd)->queue, "%s " fmt, __pbuf, ##args); \ +} while (0) + +#else /* CONFIG_BFQ_SQ_GROUP_IOSCHED */ + +#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \ + blk_add_trace_msg((bfqd)->queue, "bfq%d%c " fmt, (bfqq)->pid, \ + bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ + ##args) +#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do {} while (0) + +#endif /* CONFIG_BFQ_SQ_GROUP_IOSCHED */ + +#define bfq_log(bfqd, fmt, args...) \ + blk_add_trace_msg((bfqd)->queue, "bfq " fmt, ##args) +#endif /* CONFIG_BFQ_REDIRECT_TO_CONSOLE */ + +/* Expiration reasons. */ +enum bfqq_expiration { + BFQ_BFQQ_TOO_IDLE = 0, /* + * queue has been idling for + * too long + */ + BFQ_BFQQ_BUDGET_TIMEOUT, /* budget took too long to be used */ + BFQ_BFQQ_BUDGET_EXHAUSTED, /* budget consumed */ + BFQ_BFQQ_NO_MORE_REQUESTS, /* the queue has no more requests */ + BFQ_BFQQ_PREEMPTED /* preemption in progress */ +}; + + +struct bfqg_stats { +#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED + /* number of ios merged */ + struct blkg_rwstat merged; + /* total time spent on device in ns, may not be accurate w/ queueing */ + struct blkg_rwstat service_time; + /* total time spent waiting in scheduler queue in ns */ + struct blkg_rwstat wait_time; + /* number of IOs queued up */ + struct blkg_rwstat queued; + /* total disk time and nr sectors dispatched by this group */ + struct blkg_stat time; + /* sum of number of ios queued across all samples */ + struct blkg_stat avg_queue_size_sum; + /* count of samples taken for average */ + struct blkg_stat avg_queue_size_samples; + /* how many times this group has been removed from service tree */ + struct blkg_stat dequeue; + /* total time spent waiting for it to be assigned a timeslice. */ + struct blkg_stat group_wait_time; + /* time spent idling for this blkcg_gq */ + struct blkg_stat idle_time; + /* total time with empty current active q with other requests queued */ + struct blkg_stat empty_time; + /* fields after this shouldn't be cleared on stat reset */ + uint64_t start_group_wait_time; + uint64_t start_idle_time; + uint64_t start_empty_time; + uint16_t flags; +#endif +}; + +#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED +/* + * struct bfq_group_data - per-blkcg storage for the blkio subsystem. + * + * @ps: @blkcg_policy_storage that this structure inherits + * @weight: weight of the bfq_group + */ +struct bfq_group_data { + /* must be the first member */ + struct blkcg_policy_data pd; + + unsigned int weight; +}; + +/** + * struct bfq_group - per (device, cgroup) data structure. + * @entity: schedulable entity to insert into the parent group sched_data. + * @sched_data: own sched_data, to contain child entities (they may be + * both bfq_queues and bfq_groups). + * @bfqd: the bfq_data for the device this group acts upon. + * @async_bfqq: array of async queues for all the tasks belonging to + * the group, one queue per ioprio value per ioprio_class, + * except for the idle class that has only one queue. + * @async_idle_bfqq: async queue for the idle class (ioprio is ignored). + * @my_entity: pointer to @entity, %NULL for the toplevel group; used + * to avoid too many special cases during group creation/ + * migration. + * @active_entities: number of active entities belonging to the group; + * unused for the root group. Used to know whether there + * are groups with more than one active @bfq_entity + * (see the comments to the function + * bfq_bfqq_may_idle()). + * @rq_pos_tree: rbtree sorted by next_request position, used when + * determining if two or more queues have interleaving + * requests (see bfq_find_close_cooperator()). + * + * Each (device, cgroup) pair has its own bfq_group, i.e., for each cgroup + * there is a set of bfq_groups, each one collecting the lower-level + * entities belonging to the group that are acting on the same device. + * + * Locking works as follows: + * o @bfqd is protected by the queue lock, RCU is used to access it + * from the readers. + * o All the other fields are protected by the @bfqd queue lock. + */ +struct bfq_group { + /* must be the first member */ + struct blkg_policy_data pd; + + struct bfq_entity entity; + struct bfq_sched_data sched_data; + + void *bfqd; + + struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR]; + struct bfq_queue *async_idle_bfqq; + + struct bfq_entity *my_entity; + + int active_entities; + + struct rb_root rq_pos_tree; + + struct bfqg_stats stats; +}; + +#else +struct bfq_group { + struct bfq_sched_data sched_data; + + struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR]; + struct bfq_queue *async_idle_bfqq; + + struct rb_root rq_pos_tree; +}; +#endif + +static struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity); + +static unsigned int bfq_class_idx(struct bfq_entity *entity) +{ + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); + + return bfqq ? bfqq->ioprio_class - 1 : + BFQ_DEFAULT_GRP_CLASS - 1; +} + +static struct bfq_service_tree * +bfq_entity_service_tree(struct bfq_entity *entity) +{ + struct bfq_sched_data *sched_data = entity->sched_data; + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); + unsigned int idx = bfq_class_idx(entity); + + BUG_ON(idx >= BFQ_IOPRIO_CLASSES); + BUG_ON(sched_data == NULL); + + if (bfqq) + bfq_log_bfqq(bfqq->bfqd, bfqq, + "entity_service_tree %p %d", + sched_data->service_tree + idx, idx); +#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED + else { + struct bfq_group *bfqg = + container_of(entity, struct bfq_group, entity); + + bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, + "entity_service_tree %p %d", + sched_data->service_tree + idx, idx); + } +#endif + return sched_data->service_tree + idx; +} + +static struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, bool is_sync) +{ + return bic->bfqq[is_sync]; +} + +static void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq, + bool is_sync) +{ + bic->bfqq[is_sync] = bfqq; +} + +static struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic) +{ + return bic->icq.q->elevator->elevator_data; +} + +#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED + +static struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq) +{ + struct bfq_entity *group_entity = bfqq->entity.parent; + + if (!group_entity) + group_entity = &bfqq->bfqd->root_group->entity; + + return container_of(group_entity, struct bfq_group, entity); +} + +#else + +static struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq) +{ + return bfqq->bfqd->root_group; +} + +#endif + +static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio); +static void bfq_put_queue(struct bfq_queue *bfqq); +static void bfq_dispatch_insert(struct request_queue *q, struct request *rq); +static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, + struct bio *bio, bool is_sync, + struct bfq_io_cq *bic); +static void bfq_end_wr_async_queues(struct bfq_data *bfqd, + struct bfq_group *bfqg); +#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED +static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg); +#endif +static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq); + +#endif /* _BFQ_H */ diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 8da66379f7ea..bf000c58644b 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -54,7 +54,7 @@ struct blk_stat_callback; * Maximum number of blkcg policies allowed to be registered concurrently. * Defined here to simplify include dependency. */ -#define BLKCG_MAX_POLS 3 +#define BLKCG_MAX_POLS 4 typedef void (rq_end_io_fn)(struct request *, blk_status_t); From 9916fed6c89c61a2b26053be04501784570bbec8 Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Thu, 20 Jul 2017 10:46:39 +0200 Subject: [PATCH 02/51] Add extra checks related to entity scheduling - extra checks related to ioprioi-class changes - specific check on st->idle in __bfq_requeue_entity Signed-off-by: Paolo Valente --- block/bfq-sched.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/block/bfq-sched.c b/block/bfq-sched.c index ac8991bca9fa..5ddf9af4261e 100644 --- a/block/bfq-sched.c +++ b/block/bfq-sched.c @@ -812,6 +812,7 @@ __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st, } #endif + BUG_ON(entity->tree && update_class_too); BUG_ON(old_st->wsum < entity->weight); old_st->wsum -= entity->weight; @@ -883,8 +884,10 @@ __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st, new_st->wsum += entity->weight; - if (new_st != old_st) + if (new_st != old_st) { + BUG_ON(!update_class_too); entity->start = new_st->vtime; + } } return new_st; @@ -993,6 +996,7 @@ static void bfq_update_fin_time_enqueue(struct bfq_entity *entity, * tree, then it is safe to invoke next function with the last * parameter set (see the comments on the function). */ + BUG_ON(entity->tree); st = __bfq_entity_update_weight_prio(st, entity, true); bfq_calc_finish(entity, entity->budget); @@ -1113,9 +1117,11 @@ static void __bfq_activate_entity(struct bfq_entity *entity, * check for that. */ bfq_idle_extract(st, entity); + BUG_ON(entity->tree); entity->start = bfq_gt(min_vstart, entity->finish) ? min_vstart : entity->finish; } else { + BUG_ON(entity->tree); /* * The finish time of the entity may be invalid, and * it is in the past for sure, otherwise the queue @@ -1203,6 +1209,7 @@ static void __bfq_requeue_entity(struct bfq_entity *entity) */ bfq_calc_finish(entity, entity->service); entity->start = entity->finish; + BUG_ON(entity->tree && entity->tree == &st->idle); BUG_ON(entity->tree && entity->tree != &st->active); /* * In addition, if the entity had more than one child From 8f5b2c25dcbe31dda524e85b921b3aa1fe11d111 Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Fri, 21 Jul 2017 12:08:57 +0200 Subject: [PATCH 03/51] block, bfq: reset in_service_entity if it becomes idle BFQ implements hierarchical scheduling by representing each group of queues with a generic parent entity. For each parent entity, BFQ maintains an in_service_entity pointer: if one of the child entities happens to be in service, in_service_entity points to it. The resetting of these pointers happens only on queue expirations: when the in-service queue is expired, i.e., stops to be the queue in service, BFQ resets all in_service_entity pointers along the parent-entity path from this queue to the root entity. Functions handling the scheduling of entities assume, naturally, that in-service entities are active, i.e., have pending I/O requests (or, as a special case, even if they have no pending requests, they are expected to receive a new request very soon, with the scheduler idling the storage device while waiting for such an event). Unfortunately, the above resetting scheme of the in_service_entity pointers may cause this assumption to be violated. For example, the in-service queue may happen to remain without requests because of a request merge. In this case the queue does become idle, and all related data structures are updated accordingly. But in_service_entity still points to the queue in the parent entity. This inconsistency may even propagate to higher-level parent entities, if they happen to become idle as well, as a consequence of the leaf queue becoming idle. For this queue and parent entities, scheduling functions have an undefined behaviour, and, as reported, may easily lead to kernel crashes or hangs. This commit addresses this issue by simply resetting the in_service_entity field also when it is detected to point to an entity becoming idle (regardless of why the entity becomes idle). Reported-by: Laurentiu Nicola Signed-off-by: Paolo Valente Tested-by: Laurentiu Nicola --- block/bfq-sched.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/block/bfq-sched.c b/block/bfq-sched.c index 5ddf9af4261e..a07a06eb5c72 100644 --- a/block/bfq-sched.c +++ b/block/bfq-sched.c @@ -1336,8 +1336,10 @@ static bool __bfq_deactivate_entity(struct bfq_entity *entity, BUG_ON(is_in_service && entity->tree && entity->tree != &st->active); - if (is_in_service) + if (is_in_service) { bfq_calc_finish(entity, entity->service); + sd->in_service_entity = NULL; + } if (entity->tree == &st->active) bfq_active_extract(st, entity); From 600ea668e2d340c95724bcf981d88812d6900342 Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Fri, 28 Jul 2017 21:09:51 +0200 Subject: [PATCH 04/51] block, bfq: consider also in_service_entity to state whether an entity is active Groups of BFQ queues are represented by generic entities in BFQ. When a queue belonging to a parent entity is deactivated, the parent entity may need to be deactivated too, in case the deactivated queue was the only active queue for the parent entity. This deactivation may need to be propagated upwards if the entity belongs, in its turn, to a further higher-level entity, and so on. In particular, the upward propagation of deactivation stops at the first parent entity that remains active even if one of its child entities has been deactivated. To decide whether the last non-deactivation condition holds for a parent entity, BFQ checks whether the field next_in_service is still not NULL for the parent entity, after the deactivation of one of its child entity. If it is not NULL, then there are certainly other active entities in the parent entity, and deactivations can stop. Unfortunately, this check misses a corner case: if in_service_entity is not NULL, then next_in_service may happen to be NULL, although the parent entity is evidently active. This happens if: 1) the entity pointed by in_service_entity is the only active entity in the parent entity, and 2) according to the definition of next_in_service, the in_service_entity cannot be considered as next_in_service. See the comments on the definition of next_in_service for details on this second point. Hitting the above corner case causes crashes. To address this issue, this commit: 1) Extends the above check on only next_in_service to controlling both next_in_service and in_service_entity (if any of them is not NULL, then no further deactivation is performed) 2) Improves the (important) comments on how next_in_service is defined and updated; in particular it fixes a few rather obscure paragraphs Reported-by: Eric Wheeler Reported-by: Rick Yiu Reported-by: Tom X Nguyen Signed-off-by: Paolo Valente Tested-by: Eric Wheeler Tested-by: Rick Yiu Tested-by: Laurentiu Nicola Tested-by: Tom X Nguyen --- block/bfq-sched.c | 140 ++++++++++++++++++++++++++++++------------------------ block/bfq.h | 23 +++++++-- 2 files changed, 95 insertions(+), 68 deletions(-) diff --git a/block/bfq-sched.c b/block/bfq-sched.c index a07a06eb5c72..5c0f9290a79c 100644 --- a/block/bfq-sched.c +++ b/block/bfq-sched.c @@ -196,21 +196,23 @@ static bool bfq_update_parent_budget(struct bfq_entity *next_in_service) /* * This function tells whether entity stops being a candidate for next - * service, according to the following logic. + * service, according to the restrictive definition of the field + * next_in_service. In particular, this function is invoked for an + * entity that is about to be set in service. * - * This function is invoked for an entity that is about to be set in - * service. If such an entity is a queue, then the entity is no longer - * a candidate for next service (i.e, a candidate entity to serve - * after the in-service entity is expired). The function then returns - * true. + * If entity is a queue, then the entity is no longer a candidate for + * next service according to the that definition, because entity is + * about to become the in-service queue. This function then returns + * true if entity is a queue. * - * In contrast, the entity could stil be a candidate for next service - * if it is not a queue, and has more than one child. In fact, even if - * one of its children is about to be set in service, other children - * may still be the next to serve. As a consequence, a non-queue - * entity is not a candidate for next-service only if it has only one - * child. And only if this condition holds, then the function returns - * true for a non-queue entity. + * In contrast, entity could still be a candidate for next service if + * it is not a queue, and has more than one active child. In fact, + * even if one of its children is about to be set in service, other + * active children may still be the next to serve, for the parent + * entity, even according to the above definition. As a consequence, a + * non-queue entity is not a candidate for next-service only if it has + * only one active child. And only if this condition holds, then this + * function returns true for a non-queue entity. */ static bool bfq_no_longer_next_in_service(struct bfq_entity *entity) { @@ -223,6 +225,18 @@ static bool bfq_no_longer_next_in_service(struct bfq_entity *entity) BUG_ON(bfqg == ((struct bfq_data *)(bfqg->bfqd))->root_group); BUG_ON(bfqg->active_entities == 0); + /* + * The field active_entities does not always contain the + * actual number of active children entities: it happens to + * not account for the in-service entity in case the latter is + * removed from its active tree (which may get done after + * invoking the function bfq_no_longer_next_in_service in + * bfq_get_next_queue). Fortunately, here, i.e., while + * bfq_no_longer_next_in_service is not yet completed in + * bfq_get_next_queue, bfq_active_extract has not yet been + * invoked, and thus active_entities still coincides with the + * actual number of active entities. + */ if (bfqg->active_entities == 1) return true; @@ -1089,7 +1103,7 @@ static void bfq_update_fin_time_enqueue(struct bfq_entity *entity, * one of its children receives a new request. * * Basically, this function updates the timestamps of entity and - * inserts entity into its active tree, ater possible extracting it + * inserts entity into its active tree, ater possibly extracting it * from its idle tree. */ static void __bfq_activate_entity(struct bfq_entity *entity, @@ -1213,7 +1227,7 @@ static void __bfq_requeue_entity(struct bfq_entity *entity) BUG_ON(entity->tree && entity->tree != &st->active); /* * In addition, if the entity had more than one child - * when set in service, then was not extracted from + * when set in service, then it was not extracted from * the active tree. This implies that the position of * the entity in the active tree may need to be * changed now, because we have just updated the start @@ -1221,9 +1235,8 @@ static void __bfq_requeue_entity(struct bfq_entity *entity) * time in a moment (the requeueing is then, more * precisely, a repositioning in this case). To * implement this repositioning, we: 1) dequeue the - * entity here, 2) update the finish time and - * requeue the entity according to the new - * timestamps below. + * entity here, 2) update the finish time and requeue + * the entity according to the new timestamps below. */ if (entity->tree) bfq_active_extract(st, entity); @@ -1270,9 +1283,9 @@ static void __bfq_activate_requeue_entity(struct bfq_entity *entity, /** - * bfq_activate_entity - activate or requeue an entity representing a bfq_queue, - * and activate, requeue or reposition all ancestors - * for which such an update becomes necessary. + * bfq_activate_requeue_entity - activate or requeue an entity representing a bfq_queue, + * and activate, requeue or reposition all ancestors + * for which such an update becomes necessary. * @entity: the entity to activate. * @non_blocking_wait_rq: true if this entity was waiting for a request * @requeue: true if this is a requeue, which implies that bfqq is @@ -1308,9 +1321,9 @@ static void bfq_activate_requeue_entity(struct bfq_entity *entity, * @ins_into_idle_tree: if false, the entity will not be put into the * idle tree. * - * Deactivates an entity, independently from its previous state. Must + * Deactivates an entity, independently of its previous state. Must * be invoked only if entity is on a service tree. Extracts the entity - * from that tree, and if necessary and allowed, puts it on the idle + * from that tree, and if necessary and allowed, puts it into the idle * tree. */ static bool __bfq_deactivate_entity(struct bfq_entity *entity, @@ -1359,7 +1372,7 @@ static bool __bfq_deactivate_entity(struct bfq_entity *entity, /** * bfq_deactivate_entity - deactivate an entity representing a bfq_queue. * @entity: the entity to deactivate. - * @ins_into_idle_tree: true if the entity can be put on the idle tree + * @ins_into_idle_tree: true if the entity can be put into the idle tree */ static void bfq_deactivate_entity(struct bfq_entity *entity, bool ins_into_idle_tree, @@ -1406,16 +1419,29 @@ static void bfq_deactivate_entity(struct bfq_entity *entity, */ bfq_update_next_in_service(sd, NULL); - if (sd->next_in_service) { + if (sd->next_in_service || sd->in_service_entity) { /* - * The parent entity is still backlogged, - * because next_in_service is not NULL. So, no - * further upwards deactivation must be - * performed. Yet, next_in_service has - * changed. Then the schedule does need to be - * updated upwards. + * The parent entity is still active, because + * either next_in_service or in_service_entity + * is not NULL. So, no further upwards + * deactivation must be performed. Yet, + * next_in_service has changed. Then the + * schedule does need to be updated upwards. + * + * NOTE If in_service_entity is not NULL, then + * next_in_service may happen to be NULL, + * although the parent entity is evidently + * active. This happens if 1) the entity + * pointed by in_service_entity is the only + * active entity in the parent entity, and 2) + * according to the definition of + * next_in_service, the in_service_entity + * cannot be considered as + * next_in_service. See the comments on the + * definition of next_in_service for details. */ BUG_ON(sd->next_in_service == entity); + BUG_ON(sd->in_service_entity == entity); break; } @@ -1806,45 +1832,33 @@ static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd) /* * If entity is no longer a candidate for next - * service, then we extract it from its active tree, - * for the following reason. To further boost the - * throughput in some special case, BFQ needs to know - * which is the next candidate entity to serve, while - * there is already an entity in service. In this - * respect, to make it easy to compute/update the next - * candidate entity to serve after the current - * candidate has been set in service, there is a case - * where it is necessary to extract the current - * candidate from its service tree. Such a case is - * when the entity just set in service cannot be also - * a candidate for next service. Details about when - * this conditions holds are reported in the comments - * on the function bfq_no_longer_next_in_service() - * invoked below. + * service, then it must be extracted from its active + * tree, so as to make sure that it won't be + * considered when computing next_in_service. See the + * comments on the function + * bfq_no_longer_next_in_service() for details. */ if (bfq_no_longer_next_in_service(entity)) bfq_active_extract(bfq_entity_service_tree(entity), entity); /* - * For the same reason why we may have just extracted - * entity from its active tree, we may need to update - * next_in_service for the sched_data of entity too, - * regardless of whether entity has been extracted. - * In fact, even if entity has not been extracted, a - * descendant entity may get extracted. Such an event - * would cause a change in next_in_service for the - * level of the descendant entity, and thus possibly - * back to upper levels. + * Even if entity is not to be extracted according to + * the above check, a descendant entity may get + * extracted in one of the next iterations of this + * loop. Such an event could cause a change in + * next_in_service for the level of the descendant + * entity, and thus possibly back to this level. * - * We cannot perform the resulting needed update - * before the end of this loop, because, to know which - * is the correct next-to-serve candidate entity for - * each level, we need first to find the leaf entity - * to set in service. In fact, only after we know - * which is the next-to-serve leaf entity, we can - * discover whether the parent entity of the leaf - * entity becomes the next-to-serve, and so on. + * However, we cannot perform the resulting needed + * update of next_in_service for this level before the + * end of the whole loop, because, to know which is + * the correct next-to-serve candidate entity for each + * level, we need first to find the leaf entity to set + * in service. In fact, only after we know which is + * the next-to-serve leaf entity, we can discover + * whether the parent entity of the leaf entity + * becomes the next-to-serve, and so on. */ /* Log some information */ diff --git a/block/bfq.h b/block/bfq.h index f5751ea59d98..ebd9688b9f61 100644 --- a/block/bfq.h +++ b/block/bfq.h @@ -68,17 +68,30 @@ struct bfq_service_tree { * * bfq_sched_data is the basic scheduler queue. It supports three * ioprio_classes, and can be used either as a toplevel queue or as an - * intermediate queue on a hierarchical setup. @next_in_service - * points to the active entity of the sched_data service trees that - * will be scheduled next. It is used to reduce the number of steps - * needed for each hierarchical-schedule update. + * intermediate queue in a hierarchical setup. * * The supported ioprio_classes are the same as in CFQ, in descending * priority order, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE. * Requests from higher priority queues are served before all the * requests from lower priority queues; among requests of the same * queue requests are served according to B-WF2Q+. - * All the fields are protected by the queue lock of the containing bfqd. + * + * The schedule is implemented by the service trees, plus the field + * @next_in_service, which points to the entity on the active trees + * that will be served next, if 1) no changes in the schedule occurs + * before the current in-service entity is expired, 2) the in-service + * queue becomes idle when it expires, and 3) if the entity pointed by + * in_service_entity is not a queue, then the in-service child entity + * of the entity pointed by in_service_entity becomes idle on + * expiration. This peculiar definition allows for the following + * optimization, not yet exploited: while a given entity is still in + * service, we already know which is the best candidate for next + * service among the other active entitities in the same parent + * entity. We can then quickly compare the timestamps of the + * in-service entity with those of such best candidate. + * + * All the fields are protected by the queue lock of the containing + * bfqd. */ struct bfq_sched_data { struct bfq_entity *in_service_entity; /* entity in service */ From 6b5effd10bc6711a862e7cbd7cd2dd0146defa01 Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Thu, 4 May 2017 10:53:43 +0200 Subject: [PATCH 05/51] block, bfq: improve and refactor throughput-boosting logic When a queue associated with a process remains empty, there are cases where throughput gets boosted if the device is idled to await the arrival of a new I/O request for that queue. Currently, BFQ assumes that one of these cases is when the device has no internal queueing (regardless of the properties of the I/O being served). Unfortunately, this condition has proved to be too general. So, this commit refines it as "the device has no internal queueing and is rotational". This refinement provides a significant throughput boost with random I/O, on flash-based storage without internal queueing. For example, on a HiKey board, throughput increases by up to 125%, growing, e.g., from 6.9MB/s to 15.6MB/s with two or three random readers in parallel. This commit also refactors the code related to device idling, for the following reason. Finding the change that provides the above large improvement has been slightly more difficult than it had to be, because the logic that decides whether to idle the device is still scattered across three functions. Almost all of the logic is in the function bfq_bfqq_may_idle, but (1) part of the decision is made in bfq_update_idle_window, and (2) the function bfq_bfqq_must_idle may switch off idling regardless of the output of bfq_bfqq_may_idle. In addition, both bfq_update_idle_window and bfq_bfqq_must_idle make their decisions as a function of parameters that are used, for similar purposes, also in bfq_bfqq_may_idle. This commit addresses this issue by moving all the logic into bfq_bfqq_may_idle. Signed-off-by: Paolo Valente Signed-off-by: Luca Miccio --- block/bfq-sq-iosched.c | 141 +++++++++++++++++++++++++++---------------------- block/bfq.h | 12 ++--- 2 files changed, 83 insertions(+), 70 deletions(-) diff --git a/block/bfq-sq-iosched.c b/block/bfq-sq-iosched.c index 65e7c7e77f3c..30d019fc67e0 100644 --- a/block/bfq-sq-iosched.c +++ b/block/bfq-sq-iosched.c @@ -684,10 +684,10 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd, unsigned int old_wr_coeff; bool busy = bfq_already_existing && bfq_bfqq_busy(bfqq); - if (bic->saved_idle_window) - bfq_mark_bfqq_idle_window(bfqq); + if (bic->saved_has_short_ttime) + bfq_mark_bfqq_has_short_ttime(bfqq); else - bfq_clear_bfqq_idle_window(bfqq); + bfq_clear_bfqq_has_short_ttime(bfqq); if (bic->saved_IO_bound) bfq_mark_bfqq_IO_bound(bfqq); @@ -2047,7 +2047,7 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq) if (!bic) return; - bic->saved_idle_window = bfq_bfqq_idle_window(bfqq); + bic->saved_has_short_ttime = bfq_bfqq_has_short_ttime(bfqq); bic->saved_IO_bound = bfq_bfqq_IO_bound(bfqq); bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq); bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node); @@ -3214,9 +3214,9 @@ static void bfq_bfqq_expire(struct bfq_data *bfqd, } bfq_log_bfqq(bfqd, bfqq, - "expire (%d, slow %d, num_disp %d, idle_win %d, weight %d)", + "expire (%d, slow %d, num_disp %d, short_ttime %d, weight %d)", reason, slow, bfqq->dispatched, - bfq_bfqq_idle_window(bfqq), entity->weight); + bfq_bfqq_has_short_ttime(bfqq), entity->weight); /* * Increase, decrease or leave budget unchanged according to @@ -3298,7 +3298,10 @@ static bool bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq) static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq) { struct bfq_data *bfqd = bfqq->bfqd; - bool idling_boosts_thr, idling_boosts_thr_without_issues, + bool rot_without_queueing = + !blk_queue_nonrot(bfqd->queue) && !bfqd->hw_tag, + bfqq_sequential_and_IO_bound, + idling_boosts_thr, idling_boosts_thr_without_issues, idling_needed_for_service_guarantees, asymmetric_scenario; @@ -3306,27 +3309,44 @@ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq) return true; /* + * Idling is performed only if slice_idle > 0. In addition, we + * do not idle if + * (a) bfqq is async + * (b) bfqq is in the idle io prio class: in this case we do + * not idle because we want to minimize the bandwidth that + * queues in this class can steal to higher-priority queues + */ + if (bfqd->bfq_slice_idle == 0 || !bfq_bfqq_sync(bfqq) || + bfq_class_idle(bfqq)) + return false; + + bfqq_sequential_and_IO_bound = !BFQQ_SEEKY(bfqq) && + bfq_bfqq_IO_bound(bfqq) && bfq_bfqq_has_short_ttime(bfqq); + /* * The next variable takes into account the cases where idling * boosts the throughput. * * The value of the variable is computed considering, first, that * idling is virtually always beneficial for the throughput if: - * (a) the device is not NCQ-capable, or - * (b) regardless of the presence of NCQ, the device is rotational - * and the request pattern for bfqq is I/O-bound and sequential. + * (a) the device is not NCQ-capable and rotational, or + * (b) regardless of the presence of NCQ, the device is rotational and + * the request pattern for bfqq is I/O-bound and sequential, or + * (c) regardless of whether it is rotational, the device is + * not NCQ-capable and the request pattern for bfqq is + * I/O-bound and sequential. * * Secondly, and in contrast to the above item (b), idling an * NCQ-capable flash-based device would not boost the * throughput even with sequential I/O; rather it would lower * the throughput in proportion to how fast the device * is. Accordingly, the next variable is true if any of the - * above conditions (a) and (b) is true, and, in particular, - * happens to be false if bfqd is an NCQ-capable flash-based - * device. + * above conditions (a), (b) or (c) is true, and, in + * particular, happens to be false if bfqd is an NCQ-capable + * flash-based device. */ - idling_boosts_thr = !bfqd->hw_tag || - (!blk_queue_nonrot(bfqd->queue) && bfq_bfqq_IO_bound(bfqq) && - bfq_bfqq_idle_window(bfqq)); + idling_boosts_thr = rot_without_queueing || + ((!blk_queue_nonrot(bfqd->queue) || !bfqd->hw_tag) && + bfqq_sequential_and_IO_bound); /* * The value of the next variable, @@ -3497,12 +3517,10 @@ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq) asymmetric_scenario && !bfq_bfqq_in_large_burst(bfqq); /* - * We have now all the components we need to compute the return - * value of the function, which is true only if both the following - * conditions hold: - * 1) bfqq is sync, because idling make sense only for sync queues; - * 2) idling either boosts the throughput (without issues), or - * is necessary to preserve service guarantees. + * We have now all the components we need to compute the + * return value of the function, which is true only if idling + * either boosts the throughput (without issues), or is + * necessary to preserve service guarantees. */ bfq_log_bfqq(bfqd, bfqq, "may_idle: sync %d idling_boosts_thr %d", bfq_bfqq_sync(bfqq), idling_boosts_thr); @@ -3514,9 +3532,8 @@ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq) bfq_bfqq_IO_bound(bfqq), idling_needed_for_service_guarantees); - return bfq_bfqq_sync(bfqq) && - (idling_boosts_thr_without_issues || - idling_needed_for_service_guarantees); + return idling_boosts_thr_without_issues || + idling_needed_for_service_guarantees; } /* @@ -3532,10 +3549,7 @@ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq) */ static bool bfq_bfqq_must_idle(struct bfq_queue *bfqq) { - struct bfq_data *bfqd = bfqq->bfqd; - - return RB_EMPTY_ROOT(&bfqq->sort_list) && bfqd->bfq_slice_idle != 0 && - bfq_bfqq_may_idle(bfqq); + return RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_bfqq_may_idle(bfqq); } /* @@ -3994,7 +4008,6 @@ static void bfq_set_next_ioprio_data(struct bfq_queue *bfqq, case IOPRIO_CLASS_IDLE: bfqq->new_ioprio_class = IOPRIO_CLASS_IDLE; bfqq->new_ioprio = 7; - bfq_clear_bfqq_idle_window(bfqq); break; } @@ -4058,8 +4071,14 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, bfq_set_next_ioprio_data(bfqq, bic); if (is_sync) { + /* + * No need to mark as has_short_ttime if in + * idle_class, because no device idling is performed + * for queues in idle class + */ if (!bfq_class_idle(bfqq)) - bfq_mark_bfqq_idle_window(bfqq); + /* tentatively mark as has_short_ttime */ + bfq_mark_bfqq_has_short_ttime(bfqq); bfq_mark_bfqq_sync(bfqq); bfq_mark_bfqq_just_created(bfqq); } else @@ -4195,18 +4214,19 @@ bfq_update_io_seektime(struct bfq_data *bfqd, struct bfq_queue *bfqq, blk_rq_sectors(rq) < BFQQ_SECT_THR_NONROT); } -/* - * Disable idle window if the process thinks too long or seeks so much that - * it doesn't matter. - */ -static void bfq_update_idle_window(struct bfq_data *bfqd, - struct bfq_queue *bfqq, - struct bfq_io_cq *bic) +static void bfq_update_has_short_ttime(struct bfq_data *bfqd, + struct bfq_queue *bfqq, + struct bfq_io_cq *bic) { - int enable_idle; + bool has_short_ttime = true; - /* Don't idle for async or idle io prio class. */ - if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq)) + /* + * No need to update has_short_ttime if bfqq is async or in + * idle io prio class, or if bfq_slice_idle is zero, because + * no device idling is performed for bfqq in this case. + */ + if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq) || + bfqd->bfq_slice_idle == 0) return; /* Idle window just restored, statistics are meaningless. */ @@ -4214,27 +4234,22 @@ static void bfq_update_idle_window(struct bfq_data *bfqd, bfqd->bfq_wr_min_idle_time)) return; - enable_idle = bfq_bfqq_idle_window(bfqq); - + /* Think time is infinite if no process is linked to + * bfqq. Otherwise check average think time to + * decide whether to mark as has_short_ttime + */ if (atomic_read(&bic->icq.ioc->active_ref) == 0 || - bfqd->bfq_slice_idle == 0 || - (bfqd->hw_tag && BFQQ_SEEKY(bfqq) && - bfqq->wr_coeff == 1)) - enable_idle = 0; - else if (bfq_sample_valid(bic->ttime.ttime_samples)) { - if (bic->ttime.ttime_mean > bfqd->bfq_slice_idle && - bfqq->wr_coeff == 1) - enable_idle = 0; - else - enable_idle = 1; - } - bfq_log_bfqq(bfqd, bfqq, "update_idle_window: enable_idle %d", - enable_idle); + (bfq_sample_valid(bic->ttime.ttime_samples) && + bic->ttime.ttime_mean > bfqd->bfq_slice_idle)) + has_short_ttime = false; + + bfq_log_bfqq(bfqd, bfqq, "update_has_short_ttime: has_short_ttime %d", + has_short_ttime); - if (enable_idle) - bfq_mark_bfqq_idle_window(bfqq); + if (has_short_ttime) + bfq_mark_bfqq_has_short_ttime(bfqq); else - bfq_clear_bfqq_idle_window(bfqq); + bfq_clear_bfqq_has_short_ttime(bfqq); } /* @@ -4250,14 +4265,12 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, bfqq->meta_pending++; bfq_update_io_thinktime(bfqd, bic); + bfq_update_has_short_ttime(bfqd, bfqq, bic); bfq_update_io_seektime(bfqd, bfqq, rq); - if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 || - !BFQQ_SEEKY(bfqq)) - bfq_update_idle_window(bfqd, bfqq, bic); bfq_log_bfqq(bfqd, bfqq, - "rq_enqueued: idle_window=%d (seeky %d)", - bfq_bfqq_idle_window(bfqq), BFQQ_SEEKY(bfqq)); + "rq_enqueued: has_short_ttime=%d (seeky %d)", + bfq_bfqq_has_short_ttime(bfqq), BFQQ_SEEKY(bfqq)); bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq); diff --git a/block/bfq.h b/block/bfq.h index ebd9688b9f61..34fc4697fd89 100644 --- a/block/bfq.h +++ b/block/bfq.h @@ -349,11 +349,11 @@ struct bfq_io_cq { #endif /* - * Snapshot of the idle window before merging; taken to - * remember this value while the queue is merged, so as to be - * able to restore it in case of split. + * Snapshot of the has_short_time flag before merging; taken + * to remember its value while the queue is merged, so as to + * be able to restore it in case of split. */ - bool saved_idle_window; + bool saved_has_short_ttime; /* * Same purpose as the previous two fields for the I/O bound * classification of a queue. @@ -610,7 +610,7 @@ enum bfqq_state_flags { */ BFQ_BFQQ_FLAG_must_alloc, /* must be allowed rq alloc */ BFQ_BFQQ_FLAG_fifo_expire, /* FIFO checked in this slice */ - BFQ_BFQQ_FLAG_idle_window, /* slice idling enabled */ + BFQ_BFQQ_FLAG_has_short_ttime, /* queue has a short think time */ BFQ_BFQQ_FLAG_sync, /* synchronous queue */ BFQ_BFQQ_FLAG_IO_bound, /* * bfqq has timed-out at least once @@ -649,7 +649,7 @@ BFQ_BFQQ_FNS(wait_request); BFQ_BFQQ_FNS(non_blocking_wait_rq); BFQ_BFQQ_FNS(must_alloc); BFQ_BFQQ_FNS(fifo_expire); -BFQ_BFQQ_FNS(idle_window); +BFQ_BFQQ_FNS(has_short_ttime); BFQ_BFQQ_FNS(sync); BFQ_BFQQ_FNS(IO_bound); BFQ_BFQQ_FNS(in_large_burst); From b5e746fa99d961a5642cffb27c19a77e8b638007 Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Mon, 19 Dec 2016 16:59:33 +0100 Subject: [PATCH 06/51] FIRST BFQ-MQ COMMIT: Copy bfq-sq-iosched.c as bfq-mq-iosched.c This commit introduces bfq-mq-iosched.c, the main source file that will contain the code of bfq for blk-mq. I name tentatively bfq-mq this version of bfq. For the moment, the file bfq-mq-iosched.c is just a copy of bfq-sq-iosched.c, i.e, of the main source file of bfq for blk. Signed-off-by: Paolo Valente --- block/bfq-mq-iosched.c | 5392 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 5392 insertions(+) create mode 100644 block/bfq-mq-iosched.c diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c new file mode 100644 index 000000000000..30d019fc67e0 --- /dev/null +++ b/block/bfq-mq-iosched.c @@ -0,0 +1,5392 @@ +/* + * Budget Fair Queueing (BFQ) I/O scheduler. + * + * Based on ideas and code from CFQ: + * Copyright (C) 2003 Jens Axboe + * + * Copyright (C) 2008 Fabio Checconi + * Paolo Valente + * + * Copyright (C) 2015 Paolo Valente + * + * Copyright (C) 2017 Paolo Valente + * + * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ + * file. + * + * BFQ is a proportional-share I/O scheduler, with some extra + * low-latency capabilities. BFQ also supports full hierarchical + * scheduling through cgroups. Next paragraphs provide an introduction + * on BFQ inner workings. Details on BFQ benefits and usage can be + * found in Documentation/block/bfq-iosched.txt. + * + * BFQ is a proportional-share storage-I/O scheduling algorithm based + * on the slice-by-slice service scheme of CFQ. But BFQ assigns + * budgets, measured in number of sectors, to processes instead of + * time slices. The device is not granted to the in-service process + * for a given time slice, but until it has exhausted its assigned + * budget. This change from the time to the service domain enables BFQ + * to distribute the device throughput among processes as desired, + * without any distortion due to throughput fluctuations, or to device + * internal queueing. BFQ uses an ad hoc internal scheduler, called + * B-WF2Q+, to schedule processes according to their budgets. More + * precisely, BFQ schedules queues associated with processes. Thanks to + * the accurate policy of B-WF2Q+, BFQ can afford to assign high + * budgets to I/O-bound processes issuing sequential requests (to + * boost the throughput), and yet guarantee a low latency to + * interactive and soft real-time applications. + * + * NOTE: if the main or only goal, with a given device, is to achieve + * the maximum-possible throughput at all times, then do switch off + * all low-latency heuristics for that device, by setting low_latency + * to 0. + * + * BFQ is described in [1], where also a reference to the initial, more + * theoretical paper on BFQ can be found. The interested reader can find + * in the latter paper full details on the main algorithm, as well as + * formulas of the guarantees and formal proofs of all the properties. + * With respect to the version of BFQ presented in these papers, this + * implementation adds a few more heuristics, such as the one that + * guarantees a low latency to soft real-time applications, and a + * hierarchical extension based on H-WF2Q+. + * + * B-WF2Q+ is based on WF2Q+, that is described in [2], together with + * H-WF2Q+, while the augmented tree used to implement B-WF2Q+ with O(log N) + * complexity derives from the one introduced with EEVDF in [3]. + * + * [1] P. Valente, A. Avanzini, "Evolution of the BFQ Storage I/O + * Scheduler", Proceedings of the First Workshop on Mobile System + * Technologies (MST-2015), May 2015. + * http://algogroup.unimore.it/people/paolo/disk_sched/mst-2015.pdf + * + * http://algogroup.unimo.it/people/paolo/disk_sched/bf1-v1-suite-results.pdf + * + * [2] Jon C.R. Bennett and H. Zhang, ``Hierarchical Packet Fair Queueing + * Algorithms,'' IEEE/ACM Transactions on Networking, 5(5):675-689, + * Oct 1997. + * + * http://www.cs.cmu.edu/~hzhang/papers/TON-97-Oct.ps.gz + * + * [3] I. Stoica and H. Abdel-Wahab, ``Earliest Eligible Virtual Deadline + * First: A Flexible and Accurate Mechanism for Proportional Share + * Resource Allocation,'' technical report. + * + * http://www.cs.berkeley.edu/~istoica/papers/eevdf-tr-95.pdf + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include "blk.h" +#include "bfq.h" + +/* Expiration time of sync (0) and async (1) requests, in ns. */ +static const u64 bfq_fifo_expire[2] = { NSEC_PER_SEC / 4, NSEC_PER_SEC / 8 }; + +/* Maximum backwards seek, in KiB. */ +static const int bfq_back_max = (16 * 1024); + +/* Penalty of a backwards seek, in number of sectors. */ +static const int bfq_back_penalty = 2; + +/* Idling period duration, in ns. */ +static u32 bfq_slice_idle = (NSEC_PER_SEC / 125); + +/* Minimum number of assigned budgets for which stats are safe to compute. */ +static const int bfq_stats_min_budgets = 194; + +/* Default maximum budget values, in sectors and number of requests. */ +static const int bfq_default_max_budget = (16 * 1024); + +/* + * Async to sync throughput distribution is controlled as follows: + * when an async request is served, the entity is charged the number + * of sectors of the request, multiplied by the factor below + */ +static const int bfq_async_charge_factor = 10; + +/* Default timeout values, in jiffies, approximating CFQ defaults. */ +static const int bfq_timeout = (HZ / 8); + +static struct kmem_cache *bfq_pool; + +/* Below this threshold (in ns), we consider thinktime immediate. */ +#define BFQ_MIN_TT (2 * NSEC_PER_MSEC) + +/* hw_tag detection: parallel requests threshold and min samples needed. */ +#define BFQ_HW_QUEUE_THRESHOLD 4 +#define BFQ_HW_QUEUE_SAMPLES 32 + +#define BFQQ_SEEK_THR (sector_t)(8 * 100) +#define BFQQ_SECT_THR_NONROT (sector_t)(2 * 32) +#define BFQQ_CLOSE_THR (sector_t)(8 * 1024) +#define BFQQ_SEEKY(bfqq) (hweight32(bfqq->seek_history) > 32/8) + +/* Min number of samples required to perform peak-rate update */ +#define BFQ_RATE_MIN_SAMPLES 32 +/* Min observation time interval required to perform a peak-rate update (ns) */ +#define BFQ_RATE_MIN_INTERVAL (300*NSEC_PER_MSEC) +/* Target observation time interval for a peak-rate update (ns) */ +#define BFQ_RATE_REF_INTERVAL NSEC_PER_SEC + +/* Shift used for peak rate fixed precision calculations. */ +#define BFQ_RATE_SHIFT 16 + +/* + * By default, BFQ computes the duration of the weight raising for + * interactive applications automatically, using the following formula: + * duration = (R / r) * T, where r is the peak rate of the device, and + * R and T are two reference parameters. + * In particular, R is the peak rate of the reference device (see below), + * and T is a reference time: given the systems that are likely to be + * installed on the reference device according to its speed class, T is + * about the maximum time needed, under BFQ and while reading two files in + * parallel, to load typical large applications on these systems. + * In practice, the slower/faster the device at hand is, the more/less it + * takes to load applications with respect to the reference device. + * Accordingly, the longer/shorter BFQ grants weight raising to interactive + * applications. + * + * BFQ uses four different reference pairs (R, T), depending on: + * . whether the device is rotational or non-rotational; + * . whether the device is slow, such as old or portable HDDs, as well as + * SD cards, or fast, such as newer HDDs and SSDs. + * + * The device's speed class is dynamically (re)detected in + * bfq_update_peak_rate() every time the estimated peak rate is updated. + * + * In the following definitions, R_slow[0]/R_fast[0] and + * T_slow[0]/T_fast[0] are the reference values for a slow/fast + * rotational device, whereas R_slow[1]/R_fast[1] and + * T_slow[1]/T_fast[1] are the reference values for a slow/fast + * non-rotational device. Finally, device_speed_thresh are the + * thresholds used to switch between speed classes. The reference + * rates are not the actual peak rates of the devices used as a + * reference, but slightly lower values. The reason for using these + * slightly lower values is that the peak-rate estimator tends to + * yield slightly lower values than the actual peak rate (it can yield + * the actual peak rate only if there is only one process doing I/O, + * and the process does sequential I/O). + * + * Both the reference peak rates and the thresholds are measured in + * sectors/usec, left-shifted by BFQ_RATE_SHIFT. + */ +static int R_slow[2] = {1000, 10700}; +static int R_fast[2] = {14000, 33000}; +/* + * To improve readability, a conversion function is used to initialize the + * following arrays, which entails that they can be initialized only in a + * function. + */ +static int T_slow[2]; +static int T_fast[2]; +static int device_speed_thresh[2]; + +#define BFQ_SERVICE_TREE_INIT ((struct bfq_service_tree) \ + { RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 }) + +#define RQ_BIC(rq) ((struct bfq_io_cq *) (rq)->elv.priv[0]) +#define RQ_BFQQ(rq) ((rq)->elv.priv[1]) + +static void bfq_schedule_dispatch(struct bfq_data *bfqd); + +#include "bfq-ioc.c" +#include "bfq-sched.c" +#include "bfq-cgroup-included.c" + +#define bfq_class_idle(bfqq) ((bfqq)->ioprio_class == IOPRIO_CLASS_IDLE) +#define bfq_class_rt(bfqq) ((bfqq)->ioprio_class == IOPRIO_CLASS_RT) + +#define bfq_sample_valid(samples) ((samples) > 80) + +/* + * Scheduler run of queue, if there are requests pending and no one in the + * driver that will restart queueing. + */ +static void bfq_schedule_dispatch(struct bfq_data *bfqd) +{ + if (bfqd->queued != 0) { + bfq_log(bfqd, "schedule dispatch"); + kblockd_schedule_work(&bfqd->unplug_work); + } +} + +/* + * Lifted from AS - choose which of rq1 and rq2 that is best served now. + * We choose the request that is closesr to the head right now. Distance + * behind the head is penalized and only allowed to a certain extent. + */ +static struct request *bfq_choose_req(struct bfq_data *bfqd, + struct request *rq1, + struct request *rq2, + sector_t last) +{ + sector_t s1, s2, d1 = 0, d2 = 0; + unsigned long back_max; +#define BFQ_RQ1_WRAP 0x01 /* request 1 wraps */ +#define BFQ_RQ2_WRAP 0x02 /* request 2 wraps */ + unsigned int wrap = 0; /* bit mask: requests behind the disk head? */ + + if (!rq1 || rq1 == rq2) + return rq2; + if (!rq2) + return rq1; + + if (rq_is_sync(rq1) && !rq_is_sync(rq2)) + return rq1; + else if (rq_is_sync(rq2) && !rq_is_sync(rq1)) + return rq2; + if ((rq1->cmd_flags & REQ_META) && !(rq2->cmd_flags & REQ_META)) + return rq1; + else if ((rq2->cmd_flags & REQ_META) && !(rq1->cmd_flags & REQ_META)) + return rq2; + + s1 = blk_rq_pos(rq1); + s2 = blk_rq_pos(rq2); + + /* + * By definition, 1KiB is 2 sectors. + */ + back_max = bfqd->bfq_back_max * 2; + + /* + * Strict one way elevator _except_ in the case where we allow + * short backward seeks which are biased as twice the cost of a + * similar forward seek. + */ + if (s1 >= last) + d1 = s1 - last; + else if (s1 + back_max >= last) + d1 = (last - s1) * bfqd->bfq_back_penalty; + else + wrap |= BFQ_RQ1_WRAP; + + if (s2 >= last) + d2 = s2 - last; + else if (s2 + back_max >= last) + d2 = (last - s2) * bfqd->bfq_back_penalty; + else + wrap |= BFQ_RQ2_WRAP; + + /* Found required data */ + + /* + * By doing switch() on the bit mask "wrap" we avoid having to + * check two variables for all permutations: --> faster! + */ + switch (wrap) { + case 0: /* common case for CFQ: rq1 and rq2 not wrapped */ + if (d1 < d2) + return rq1; + else if (d2 < d1) + return rq2; + + if (s1 >= s2) + return rq1; + else + return rq2; + + case BFQ_RQ2_WRAP: + return rq1; + case BFQ_RQ1_WRAP: + return rq2; + case (BFQ_RQ1_WRAP|BFQ_RQ2_WRAP): /* both rqs wrapped */ + default: + /* + * Since both rqs are wrapped, + * start with the one that's further behind head + * (--> only *one* back seek required), + * since back seek takes more time than forward. + */ + if (s1 <= s2) + return rq1; + else + return rq2; + } +} + +static struct bfq_queue * +bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root, + sector_t sector, struct rb_node **ret_parent, + struct rb_node ***rb_link) +{ + struct rb_node **p, *parent; + struct bfq_queue *bfqq = NULL; + + parent = NULL; + p = &root->rb_node; + while (*p) { + struct rb_node **n; + + parent = *p; + bfqq = rb_entry(parent, struct bfq_queue, pos_node); + + /* + * Sort strictly based on sector. Smallest to the left, + * largest to the right. + */ + if (sector > blk_rq_pos(bfqq->next_rq)) + n = &(*p)->rb_right; + else if (sector < blk_rq_pos(bfqq->next_rq)) + n = &(*p)->rb_left; + else + break; + p = n; + bfqq = NULL; + } + + *ret_parent = parent; + if (rb_link) + *rb_link = p; + + bfq_log(bfqd, "rq_pos_tree_lookup %llu: returning %d", + (unsigned long long) sector, + bfqq ? bfqq->pid : 0); + + return bfqq; +} + +static void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq) +{ + struct rb_node **p, *parent; + struct bfq_queue *__bfqq; + + if (bfqq->pos_root) { + rb_erase(&bfqq->pos_node, bfqq->pos_root); + bfqq->pos_root = NULL; + } + + if (bfq_class_idle(bfqq)) + return; + if (!bfqq->next_rq) + return; + + bfqq->pos_root = &bfq_bfqq_to_bfqg(bfqq)->rq_pos_tree; + __bfqq = bfq_rq_pos_tree_lookup(bfqd, bfqq->pos_root, + blk_rq_pos(bfqq->next_rq), &parent, &p); + if (!__bfqq) { + rb_link_node(&bfqq->pos_node, parent, p); + rb_insert_color(&bfqq->pos_node, bfqq->pos_root); + } else + bfqq->pos_root = NULL; +} + +/* + * Tell whether there are active queues or groups with differentiated weights. + */ +static bool bfq_differentiated_weights(struct bfq_data *bfqd) +{ + /* + * For weights to differ, at least one of the trees must contain + * at least two nodes. + */ + return (!RB_EMPTY_ROOT(&bfqd->queue_weights_tree) && + (bfqd->queue_weights_tree.rb_node->rb_left || + bfqd->queue_weights_tree.rb_node->rb_right) +#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED + ) || + (!RB_EMPTY_ROOT(&bfqd->group_weights_tree) && + (bfqd->group_weights_tree.rb_node->rb_left || + bfqd->group_weights_tree.rb_node->rb_right) +#endif + ); +} + +/* + * The following function returns true if every queue must receive the + * same share of the throughput (this condition is used when deciding + * whether idling may be disabled, see the comments in the function + * bfq_bfqq_may_idle()). + * + * Such a scenario occurs when: + * 1) all active queues have the same weight, + * 2) all active groups at the same level in the groups tree have the same + * weight, + * 3) all active groups at the same level in the groups tree have the same + * number of children. + * + * Unfortunately, keeping the necessary state for evaluating exactly the + * above symmetry conditions would be quite complex and time-consuming. + * Therefore this function evaluates, instead, the following stronger + * sub-conditions, for which it is much easier to maintain the needed + * state: + * 1) all active queues have the same weight, + * 2) all active groups have the same weight, + * 3) all active groups have at most one active child each. + * In particular, the last two conditions are always true if hierarchical + * support and the cgroups interface are not enabled, thus no state needs + * to be maintained in this case. + */ +static bool bfq_symmetric_scenario(struct bfq_data *bfqd) +{ + return !bfq_differentiated_weights(bfqd); +} + +/* + * If the weight-counter tree passed as input contains no counter for + * the weight of the input entity, then add that counter; otherwise just + * increment the existing counter. + * + * Note that weight-counter trees contain few nodes in mostly symmetric + * scenarios. For example, if all queues have the same weight, then the + * weight-counter tree for the queues may contain at most one node. + * This holds even if low_latency is on, because weight-raised queues + * are not inserted in the tree. + * In most scenarios, the rate at which nodes are created/destroyed + * should be low too. + */ +static void bfq_weights_tree_add(struct bfq_data *bfqd, + struct bfq_entity *entity, + struct rb_root *root) +{ + struct rb_node **new = &(root->rb_node), *parent = NULL; + + /* + * Do not insert if the entity is already associated with a + * counter, which happens if: + * 1) the entity is associated with a queue, + * 2) a request arrival has caused the queue to become both + * non-weight-raised, and hence change its weight, and + * backlogged; in this respect, each of the two events + * causes an invocation of this function, + * 3) this is the invocation of this function caused by the + * second event. This second invocation is actually useless, + * and we handle this fact by exiting immediately. More + * efficient or clearer solutions might possibly be adopted. + */ + if (entity->weight_counter) + return; + + while (*new) { + struct bfq_weight_counter *__counter = container_of(*new, + struct bfq_weight_counter, + weights_node); + parent = *new; + + if (entity->weight == __counter->weight) { + entity->weight_counter = __counter; + goto inc_counter; + } + if (entity->weight < __counter->weight) + new = &((*new)->rb_left); + else + new = &((*new)->rb_right); + } + + entity->weight_counter = kzalloc(sizeof(struct bfq_weight_counter), + GFP_ATOMIC); + + /* + * In the unlucky event of an allocation failure, we just + * exit. This will cause the weight of entity to not be + * considered in bfq_differentiated_weights, which, in its + * turn, causes the scenario to be deemed wrongly symmetric in + * case entity's weight would have been the only weight making + * the scenario asymmetric. On the bright side, no unbalance + * will however occur when entity becomes inactive again (the + * invocation of this function is triggered by an activation + * of entity). In fact, bfq_weights_tree_remove does nothing + * if !entity->weight_counter. + */ + if (unlikely(!entity->weight_counter)) + return; + + entity->weight_counter->weight = entity->weight; + rb_link_node(&entity->weight_counter->weights_node, parent, new); + rb_insert_color(&entity->weight_counter->weights_node, root); + +inc_counter: + entity->weight_counter->num_active++; +} + +/* + * Decrement the weight counter associated with the entity, and, if the + * counter reaches 0, remove the counter from the tree. + * See the comments to the function bfq_weights_tree_add() for considerations + * about overhead. + */ +static void bfq_weights_tree_remove(struct bfq_data *bfqd, + struct bfq_entity *entity, + struct rb_root *root) +{ + if (!entity->weight_counter) + return; + + BUG_ON(RB_EMPTY_ROOT(root)); + BUG_ON(entity->weight_counter->weight != entity->weight); + + BUG_ON(!entity->weight_counter->num_active); + entity->weight_counter->num_active--; + if (entity->weight_counter->num_active > 0) + goto reset_entity_pointer; + + rb_erase(&entity->weight_counter->weights_node, root); + kfree(entity->weight_counter); + +reset_entity_pointer: + entity->weight_counter = NULL; +} + +/* + * Return expired entry, or NULL to just start from scratch in rbtree. + */ +static struct request *bfq_check_fifo(struct bfq_queue *bfqq, + struct request *last) +{ + struct request *rq; + + if (bfq_bfqq_fifo_expire(bfqq)) + return NULL; + + bfq_mark_bfqq_fifo_expire(bfqq); + + rq = rq_entry_fifo(bfqq->fifo.next); + + if (rq == last || ktime_get_ns() < rq->fifo_time) + return NULL; + + bfq_log_bfqq(bfqq->bfqd, bfqq, "check_fifo: returned %p", rq); + BUG_ON(RB_EMPTY_NODE(&rq->rb_node)); + return rq; +} + +static struct request *bfq_find_next_rq(struct bfq_data *bfqd, + struct bfq_queue *bfqq, + struct request *last) +{ + struct rb_node *rbnext = rb_next(&last->rb_node); + struct rb_node *rbprev = rb_prev(&last->rb_node); + struct request *next, *prev = NULL; + + BUG_ON(list_empty(&bfqq->fifo)); + + /* Follow expired path, else get first next available. */ + next = bfq_check_fifo(bfqq, last); + if (next) { + BUG_ON(next == last); + return next; + } + + BUG_ON(RB_EMPTY_NODE(&last->rb_node)); + + if (rbprev) + prev = rb_entry_rq(rbprev); + + if (rbnext) + next = rb_entry_rq(rbnext); + else { + rbnext = rb_first(&bfqq->sort_list); + if (rbnext && rbnext != &last->rb_node) + next = rb_entry_rq(rbnext); + } + + return bfq_choose_req(bfqd, next, prev, blk_rq_pos(last)); +} + +/* see the definition of bfq_async_charge_factor for details */ +static unsigned long bfq_serv_to_charge(struct request *rq, + struct bfq_queue *bfqq) +{ + if (bfq_bfqq_sync(bfqq) || bfqq->wr_coeff > 1) + return blk_rq_sectors(rq); + + /* + * If there are no weight-raised queues, then amplify service + * by just the async charge factor; otherwise amplify service + * by twice the async charge factor, to further reduce latency + * for weight-raised queues. + */ + if (bfqq->bfqd->wr_busy_queues == 0) + return blk_rq_sectors(rq) * bfq_async_charge_factor; + + return blk_rq_sectors(rq) * 2 * bfq_async_charge_factor; +} + +/** + * bfq_updated_next_req - update the queue after a new next_rq selection. + * @bfqd: the device data the queue belongs to. + * @bfqq: the queue to update. + * + * If the first request of a queue changes we make sure that the queue + * has enough budget to serve at least its first request (if the + * request has grown). We do this because if the queue has not enough + * budget for its first request, it has to go through two dispatch + * rounds to actually get it dispatched. + */ +static void bfq_updated_next_req(struct bfq_data *bfqd, + struct bfq_queue *bfqq) +{ + struct bfq_entity *entity = &bfqq->entity; + struct bfq_service_tree *st = bfq_entity_service_tree(entity); + struct request *next_rq = bfqq->next_rq; + unsigned long new_budget; + + if (!next_rq) + return; + + if (bfqq == bfqd->in_service_queue) + /* + * In order not to break guarantees, budgets cannot be + * changed after an entity has been selected. + */ + return; + + BUG_ON(entity->tree != &st->active); + BUG_ON(entity == entity->sched_data->in_service_entity); + + new_budget = max_t(unsigned long, bfqq->max_budget, + bfq_serv_to_charge(next_rq, bfqq)); + if (entity->budget != new_budget) { + entity->budget = new_budget; + bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu", + new_budget); + bfq_requeue_bfqq(bfqd, bfqq); + } +} + +static unsigned int bfq_wr_duration(struct bfq_data *bfqd) +{ + u64 dur; + + if (bfqd->bfq_wr_max_time > 0) + return bfqd->bfq_wr_max_time; + + dur = bfqd->RT_prod; + do_div(dur, bfqd->peak_rate); + + /* + * Limit duration between 3 and 13 seconds. Tests show that + * higher values than 13 seconds often yield the opposite of + * the desired result, i.e., worsen responsiveness by letting + * non-interactive and non-soft-real-time applications + * preserve weight raising for a too long time interval. + * + * On the other end, lower values than 3 seconds make it + * difficult for most interactive tasks to complete their jobs + * before weight-raising finishes. + */ + if (dur > msecs_to_jiffies(13000)) + dur = msecs_to_jiffies(13000); + else if (dur < msecs_to_jiffies(3000)) + dur = msecs_to_jiffies(3000); + + return dur; +} + +static void +bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd, + struct bfq_io_cq *bic, bool bfq_already_existing) +{ + unsigned int old_wr_coeff; + bool busy = bfq_already_existing && bfq_bfqq_busy(bfqq); + + if (bic->saved_has_short_ttime) + bfq_mark_bfqq_has_short_ttime(bfqq); + else + bfq_clear_bfqq_has_short_ttime(bfqq); + + if (bic->saved_IO_bound) + bfq_mark_bfqq_IO_bound(bfqq); + else + bfq_clear_bfqq_IO_bound(bfqq); + + if (unlikely(busy)) + old_wr_coeff = bfqq->wr_coeff; + + bfqq->wr_coeff = bic->saved_wr_coeff; + bfqq->wr_start_at_switch_to_srt = bic->saved_wr_start_at_switch_to_srt; + BUG_ON(time_is_after_jiffies(bfqq->wr_start_at_switch_to_srt)); + bfqq->last_wr_start_finish = bic->saved_last_wr_start_finish; + bfqq->wr_cur_max_time = bic->saved_wr_cur_max_time; + BUG_ON(time_is_after_jiffies(bfqq->last_wr_start_finish)); + + if (bfqq->wr_coeff > 1 && (bfq_bfqq_in_large_burst(bfqq) || + time_is_before_jiffies(bfqq->last_wr_start_finish + + bfqq->wr_cur_max_time))) { + bfq_log_bfqq(bfqq->bfqd, bfqq, + "resume state: switching off wr (%lu + %lu < %lu)", + bfqq->last_wr_start_finish, bfqq->wr_cur_max_time, + jiffies); + + bfqq->wr_coeff = 1; + } + + /* make sure weight will be updated, however we got here */ + bfqq->entity.prio_changed = 1; + + if (likely(!busy)) + return; + + if (old_wr_coeff == 1 && bfqq->wr_coeff > 1) { + bfqd->wr_busy_queues++; + BUG_ON(bfqd->wr_busy_queues > bfqd->busy_queues); + } else if (old_wr_coeff > 1 && bfqq->wr_coeff == 1) { + bfqd->wr_busy_queues--; + BUG_ON(bfqd->wr_busy_queues < 0); + } +} + +static int bfqq_process_refs(struct bfq_queue *bfqq) +{ + int process_refs, io_refs; + + lockdep_assert_held(bfqq->bfqd->queue->queue_lock); + + io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE]; + process_refs = bfqq->ref - io_refs - bfqq->entity.on_st; + BUG_ON(process_refs < 0); + return process_refs; +} + +/* Empty burst list and add just bfqq (see comments to bfq_handle_burst) */ +static void bfq_reset_burst_list(struct bfq_data *bfqd, struct bfq_queue *bfqq) +{ + struct bfq_queue *item; + struct hlist_node *n; + + hlist_for_each_entry_safe(item, n, &bfqd->burst_list, burst_list_node) + hlist_del_init(&item->burst_list_node); + hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list); + bfqd->burst_size = 1; + bfqd->burst_parent_entity = bfqq->entity.parent; +} + +/* Add bfqq to the list of queues in current burst (see bfq_handle_burst) */ +static void bfq_add_to_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq) +{ + /* Increment burst size to take into account also bfqq */ + bfqd->burst_size++; + + bfq_log_bfqq(bfqd, bfqq, "add_to_burst %d", bfqd->burst_size); + + BUG_ON(bfqd->burst_size > bfqd->bfq_large_burst_thresh); + + if (bfqd->burst_size == bfqd->bfq_large_burst_thresh) { + struct bfq_queue *pos, *bfqq_item; + struct hlist_node *n; + + /* + * Enough queues have been activated shortly after each + * other to consider this burst as large. + */ + bfqd->large_burst = true; + bfq_log_bfqq(bfqd, bfqq, "add_to_burst: large burst started"); + + /* + * We can now mark all queues in the burst list as + * belonging to a large burst. + */ + hlist_for_each_entry(bfqq_item, &bfqd->burst_list, + burst_list_node) { + bfq_mark_bfqq_in_large_burst(bfqq_item); + bfq_log_bfqq(bfqd, bfqq_item, "marked in large burst"); + } + bfq_mark_bfqq_in_large_burst(bfqq); + bfq_log_bfqq(bfqd, bfqq, "marked in large burst"); + + /* + * From now on, and until the current burst finishes, any + * new queue being activated shortly after the last queue + * was inserted in the burst can be immediately marked as + * belonging to a large burst. So the burst list is not + * needed any more. Remove it. + */ + hlist_for_each_entry_safe(pos, n, &bfqd->burst_list, + burst_list_node) + hlist_del_init(&pos->burst_list_node); + } else /* + * Burst not yet large: add bfqq to the burst list. Do + * not increment the ref counter for bfqq, because bfqq + * is removed from the burst list before freeing bfqq + * in put_queue. + */ + hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list); +} + +/* + * If many queues belonging to the same group happen to be created + * shortly after each other, then the processes associated with these + * queues have typically a common goal. In particular, bursts of queue + * creations are usually caused by services or applications that spawn + * many parallel threads/processes. Examples are systemd during boot, + * or git grep. To help these processes get their job done as soon as + * possible, it is usually better to not grant either weight-raising + * or device idling to their queues. + * + * In this comment we describe, firstly, the reasons why this fact + * holds, and, secondly, the next function, which implements the main + * steps needed to properly mark these queues so that they can then be + * treated in a different way. + * + * The above services or applications benefit mostly from a high + * throughput: the quicker the requests of the activated queues are + * cumulatively served, the sooner the target job of these queues gets + * completed. As a consequence, weight-raising any of these queues, + * which also implies idling the device for it, is almost always + * counterproductive. In most cases it just lowers throughput. + * + * On the other hand, a burst of queue creations may be caused also by + * the start of an application that does not consist of a lot of + * parallel I/O-bound threads. In fact, with a complex application, + * several short processes may need to be executed to start-up the + * application. In this respect, to start an application as quickly as + * possible, the best thing to do is in any case to privilege the I/O + * related to the application with respect to all other + * I/O. Therefore, the best strategy to start as quickly as possible + * an application that causes a burst of queue creations is to + * weight-raise all the queues created during the burst. This is the + * exact opposite of the best strategy for the other type of bursts. + * + * In the end, to take the best action for each of the two cases, the + * two types of bursts need to be distinguished. Fortunately, this + * seems relatively easy, by looking at the sizes of the bursts. In + * particular, we found a threshold such that only bursts with a + * larger size than that threshold are apparently caused by + * services or commands such as systemd or git grep. For brevity, + * hereafter we call just 'large' these bursts. BFQ *does not* + * weight-raise queues whose creation occurs in a large burst. In + * addition, for each of these queues BFQ performs or does not perform + * idling depending on which choice boosts the throughput more. The + * exact choice depends on the device and request pattern at + * hand. + * + * Unfortunately, false positives may occur while an interactive task + * is starting (e.g., an application is being started). The + * consequence is that the queues associated with the task do not + * enjoy weight raising as expected. Fortunately these false positives + * are very rare. They typically occur if some service happens to + * start doing I/O exactly when the interactive task starts. + * + * Turning back to the next function, it implements all the steps + * needed to detect the occurrence of a large burst and to properly + * mark all the queues belonging to it (so that they can then be + * treated in a different way). This goal is achieved by maintaining a + * "burst list" that holds, temporarily, the queues that belong to the + * burst in progress. The list is then used to mark these queues as + * belonging to a large burst if the burst does become large. The main + * steps are the following. + * + * . when the very first queue is created, the queue is inserted into the + * list (as it could be the first queue in a possible burst) + * + * . if the current burst has not yet become large, and a queue Q that does + * not yet belong to the burst is activated shortly after the last time + * at which a new queue entered the burst list, then the function appends + * Q to the burst list + * + * . if, as a consequence of the previous step, the burst size reaches + * the large-burst threshold, then + * + * . all the queues in the burst list are marked as belonging to a + * large burst + * + * . the burst list is deleted; in fact, the burst list already served + * its purpose (keeping temporarily track of the queues in a burst, + * so as to be able to mark them as belonging to a large burst in the + * previous sub-step), and now is not needed any more + * + * . the device enters a large-burst mode + * + * . if a queue Q that does not belong to the burst is created while + * the device is in large-burst mode and shortly after the last time + * at which a queue either entered the burst list or was marked as + * belonging to the current large burst, then Q is immediately marked + * as belonging to a large burst. + * + * . if a queue Q that does not belong to the burst is created a while + * later, i.e., not shortly after, than the last time at which a queue + * either entered the burst list or was marked as belonging to the + * current large burst, then the current burst is deemed as finished and: + * + * . the large-burst mode is reset if set + * + * . the burst list is emptied + * + * . Q is inserted in the burst list, as Q may be the first queue + * in a possible new burst (then the burst list contains just Q + * after this step). + */ +static void bfq_handle_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq) +{ + /* + * If bfqq is already in the burst list or is part of a large + * burst, or finally has just been split, then there is + * nothing else to do. + */ + if (!hlist_unhashed(&bfqq->burst_list_node) || + bfq_bfqq_in_large_burst(bfqq) || + time_is_after_eq_jiffies(bfqq->split_time + + msecs_to_jiffies(10))) + return; + + /* + * If bfqq's creation happens late enough, or bfqq belongs to + * a different group than the burst group, then the current + * burst is finished, and related data structures must be + * reset. + * + * In this respect, consider the special case where bfqq is + * the very first queue created after BFQ is selected for this + * device. In this case, last_ins_in_burst and + * burst_parent_entity are not yet significant when we get + * here. But it is easy to verify that, whether or not the + * following condition is true, bfqq will end up being + * inserted into the burst list. In particular the list will + * happen to contain only bfqq. And this is exactly what has + * to happen, as bfqq may be the first queue of the first + * burst. + */ + if (time_is_before_jiffies(bfqd->last_ins_in_burst + + bfqd->bfq_burst_interval) || + bfqq->entity.parent != bfqd->burst_parent_entity) { + bfqd->large_burst = false; + bfq_reset_burst_list(bfqd, bfqq); + bfq_log_bfqq(bfqd, bfqq, + "handle_burst: late activation or different group"); + goto end; + } + + /* + * If we get here, then bfqq is being activated shortly after the + * last queue. So, if the current burst is also large, we can mark + * bfqq as belonging to this large burst immediately. + */ + if (bfqd->large_burst) { + bfq_log_bfqq(bfqd, bfqq, "handle_burst: marked in burst"); + bfq_mark_bfqq_in_large_burst(bfqq); + goto end; + } + + /* + * If we get here, then a large-burst state has not yet been + * reached, but bfqq is being activated shortly after the last + * queue. Then we add bfqq to the burst. + */ + bfq_add_to_burst(bfqd, bfqq); +end: + /* + * At this point, bfqq either has been added to the current + * burst or has caused the current burst to terminate and a + * possible new burst to start. In particular, in the second + * case, bfqq has become the first queue in the possible new + * burst. In both cases last_ins_in_burst needs to be moved + * forward. + */ + bfqd->last_ins_in_burst = jiffies; + +} + +static int bfq_bfqq_budget_left(struct bfq_queue *bfqq) +{ + struct bfq_entity *entity = &bfqq->entity; + + return entity->budget - entity->service; +} + +/* + * If enough samples have been computed, return the current max budget + * stored in bfqd, which is dynamically updated according to the + * estimated disk peak rate; otherwise return the default max budget + */ +static int bfq_max_budget(struct bfq_data *bfqd) +{ + if (bfqd->budgets_assigned < bfq_stats_min_budgets) + return bfq_default_max_budget; + else + return bfqd->bfq_max_budget; +} + +/* + * Return min budget, which is a fraction of the current or default + * max budget (trying with 1/32) + */ +static int bfq_min_budget(struct bfq_data *bfqd) +{ + if (bfqd->budgets_assigned < bfq_stats_min_budgets) + return bfq_default_max_budget / 32; + else + return bfqd->bfq_max_budget / 32; +} + +static void bfq_bfqq_expire(struct bfq_data *bfqd, + struct bfq_queue *bfqq, + bool compensate, + enum bfqq_expiration reason); + +/* + * The next function, invoked after the input queue bfqq switches from + * idle to busy, updates the budget of bfqq. The function also tells + * whether the in-service queue should be expired, by returning + * true. The purpose of expiring the in-service queue is to give bfqq + * the chance to possibly preempt the in-service queue, and the reason + * for preempting the in-service queue is to achieve one of the two + * goals below. + * + * 1. Guarantee to bfqq its reserved bandwidth even if bfqq has + * expired because it has remained idle. In particular, bfqq may have + * expired for one of the following two reasons: + * + * - BFQ_BFQQ_NO_MORE_REQUEST bfqq did not enjoy any device idling and + * did not make it to issue a new request before its last request + * was served; + * + * - BFQ_BFQQ_TOO_IDLE bfqq did enjoy device idling, but did not issue + * a new request before the expiration of the idling-time. + * + * Even if bfqq has expired for one of the above reasons, the process + * associated with the queue may be however issuing requests greedily, + * and thus be sensitive to the bandwidth it receives (bfqq may have + * remained idle for other reasons: CPU high load, bfqq not enjoying + * idling, I/O throttling somewhere in the path from the process to + * the I/O scheduler, ...). But if, after every expiration for one of + * the above two reasons, bfqq has to wait for the service of at least + * one full budget of another queue before being served again, then + * bfqq is likely to get a much lower bandwidth or resource time than + * its reserved ones. To address this issue, two countermeasures need + * to be taken. + * + * First, the budget and the timestamps of bfqq need to be updated in + * a special way on bfqq reactivation: they need to be updated as if + * bfqq did not remain idle and did not expire. In fact, if they are + * computed as if bfqq expired and remained idle until reactivation, + * then the process associated with bfqq is treated as if, instead of + * being greedy, it stopped issuing requests when bfqq remained idle, + * and restarts issuing requests only on this reactivation. In other + * words, the scheduler does not help the process recover the "service + * hole" between bfqq expiration and reactivation. As a consequence, + * the process receives a lower bandwidth than its reserved one. In + * contrast, to recover this hole, the budget must be updated as if + * bfqq was not expired at all before this reactivation, i.e., it must + * be set to the value of the remaining budget when bfqq was + * expired. Along the same line, timestamps need to be assigned the + * value they had the last time bfqq was selected for service, i.e., + * before last expiration. Thus timestamps need to be back-shifted + * with respect to their normal computation (see [1] for more details + * on this tricky aspect). + * + * Secondly, to allow the process to recover the hole, the in-service + * queue must be expired too, to give bfqq the chance to preempt it + * immediately. In fact, if bfqq has to wait for a full budget of the + * in-service queue to be completed, then it may become impossible to + * let the process recover the hole, even if the back-shifted + * timestamps of bfqq are lower than those of the in-service queue. If + * this happens for most or all of the holes, then the process may not + * receive its reserved bandwidth. In this respect, it is worth noting + * that, being the service of outstanding requests unpreemptible, a + * little fraction of the holes may however be unrecoverable, thereby + * causing a little loss of bandwidth. + * + * The last important point is detecting whether bfqq does need this + * bandwidth recovery. In this respect, the next function deems the + * process associated with bfqq greedy, and thus allows it to recover + * the hole, if: 1) the process is waiting for the arrival of a new + * request (which implies that bfqq expired for one of the above two + * reasons), and 2) such a request has arrived soon. The first + * condition is controlled through the flag non_blocking_wait_rq, + * while the second through the flag arrived_in_time. If both + * conditions hold, then the function computes the budget in the + * above-described special way, and signals that the in-service queue + * should be expired. Timestamp back-shifting is done later in + * __bfq_activate_entity. + * + * 2. Reduce latency. Even if timestamps are not backshifted to let + * the process associated with bfqq recover a service hole, bfqq may + * however happen to have, after being (re)activated, a lower finish + * timestamp than the in-service queue. That is, the next budget of + * bfqq may have to be completed before the one of the in-service + * queue. If this is the case, then preempting the in-service queue + * allows this goal to be achieved, apart from the unpreemptible, + * outstanding requests mentioned above. + * + * Unfortunately, regardless of which of the above two goals one wants + * to achieve, service trees need first to be updated to know whether + * the in-service queue must be preempted. To have service trees + * correctly updated, the in-service queue must be expired and + * rescheduled, and bfqq must be scheduled too. This is one of the + * most costly operations (in future versions, the scheduling + * mechanism may be re-designed in such a way to make it possible to + * know whether preemption is needed without needing to update service + * trees). In addition, queue preemptions almost always cause random + * I/O, and thus loss of throughput. Because of these facts, the next + * function adopts the following simple scheme to avoid both costly + * operations and too frequent preemptions: it requests the expiration + * of the in-service queue (unconditionally) only for queues that need + * to recover a hole, or that either are weight-raised or deserve to + * be weight-raised. + */ +static bool bfq_bfqq_update_budg_for_activation(struct bfq_data *bfqd, + struct bfq_queue *bfqq, + bool arrived_in_time, + bool wr_or_deserves_wr) +{ + struct bfq_entity *entity = &bfqq->entity; + + if (bfq_bfqq_non_blocking_wait_rq(bfqq) && arrived_in_time) { + /* + * We do not clear the flag non_blocking_wait_rq here, as + * the latter is used in bfq_activate_bfqq to signal + * that timestamps need to be back-shifted (and is + * cleared right after). + */ + + /* + * In next assignment we rely on that either + * entity->service or entity->budget are not updated + * on expiration if bfqq is empty (see + * __bfq_bfqq_recalc_budget). Thus both quantities + * remain unchanged after such an expiration, and the + * following statement therefore assigns to + * entity->budget the remaining budget on such an + * expiration. For clarity, entity->service is not + * updated on expiration in any case, and, in normal + * operation, is reset only when bfqq is selected for + * service (see bfq_get_next_queue). + */ + BUG_ON(bfqq->max_budget < 0); + entity->budget = min_t(unsigned long, + bfq_bfqq_budget_left(bfqq), + bfqq->max_budget); + + BUG_ON(entity->budget < 0); + return true; + } + + BUG_ON(bfqq->max_budget < 0); + entity->budget = max_t(unsigned long, bfqq->max_budget, + bfq_serv_to_charge(bfqq->next_rq, bfqq)); + BUG_ON(entity->budget < 0); + + bfq_clear_bfqq_non_blocking_wait_rq(bfqq); + return wr_or_deserves_wr; +} + +static void bfq_update_bfqq_wr_on_rq_arrival(struct bfq_data *bfqd, + struct bfq_queue *bfqq, + unsigned int old_wr_coeff, + bool wr_or_deserves_wr, + bool interactive, + bool in_burst, + bool soft_rt) +{ + if (old_wr_coeff == 1 && wr_or_deserves_wr) { + /* start a weight-raising period */ + if (interactive) { + bfqq->wr_coeff = bfqd->bfq_wr_coeff; + bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); + } else { + bfqq->wr_start_at_switch_to_srt = jiffies; + bfqq->wr_coeff = bfqd->bfq_wr_coeff * + BFQ_SOFTRT_WEIGHT_FACTOR; + bfqq->wr_cur_max_time = + bfqd->bfq_wr_rt_max_time; + } + /* + * If needed, further reduce budget to make sure it is + * close to bfqq's backlog, so as to reduce the + * scheduling-error component due to a too large + * budget. Do not care about throughput consequences, + * but only about latency. Finally, do not assign a + * too small budget either, to avoid increasing + * latency by causing too frequent expirations. + */ + bfqq->entity.budget = min_t(unsigned long, + bfqq->entity.budget, + 2 * bfq_min_budget(bfqd)); + + bfq_log_bfqq(bfqd, bfqq, + "wrais starting at %lu, rais_max_time %u", + jiffies, + jiffies_to_msecs(bfqq->wr_cur_max_time)); + } else if (old_wr_coeff > 1) { + if (interactive) { /* update wr coeff and duration */ + bfqq->wr_coeff = bfqd->bfq_wr_coeff; + bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); + } else if (in_burst) { + bfqq->wr_coeff = 1; + bfq_log_bfqq(bfqd, bfqq, + "wrais ending at %lu, rais_max_time %u", + jiffies, + jiffies_to_msecs(bfqq-> + wr_cur_max_time)); + } else if (soft_rt) { + /* + * The application is now or still meeting the + * requirements for being deemed soft rt. We + * can then correctly and safely (re)charge + * the weight-raising duration for the + * application with the weight-raising + * duration for soft rt applications. + * + * In particular, doing this recharge now, i.e., + * before the weight-raising period for the + * application finishes, reduces the probability + * of the following negative scenario: + * 1) the weight of a soft rt application is + * raised at startup (as for any newly + * created application), + * 2) since the application is not interactive, + * at a certain time weight-raising is + * stopped for the application, + * 3) at that time the application happens to + * still have pending requests, and hence + * is destined to not have a chance to be + * deemed soft rt before these requests are + * completed (see the comments to the + * function bfq_bfqq_softrt_next_start() + * for details on soft rt detection), + * 4) these pending requests experience a high + * latency because the application is not + * weight-raised while they are pending. + */ + if (bfqq->wr_cur_max_time != + bfqd->bfq_wr_rt_max_time) { + bfqq->wr_start_at_switch_to_srt = + bfqq->last_wr_start_finish; + BUG_ON(time_is_after_jiffies(bfqq->last_wr_start_finish)); + + bfqq->wr_cur_max_time = + bfqd->bfq_wr_rt_max_time; + bfqq->wr_coeff = bfqd->bfq_wr_coeff * + BFQ_SOFTRT_WEIGHT_FACTOR; + bfq_log_bfqq(bfqd, bfqq, + "switching to soft_rt wr"); + } else + bfq_log_bfqq(bfqd, bfqq, + "moving forward soft_rt wr duration"); + bfqq->last_wr_start_finish = jiffies; + } + } +} + +static bool bfq_bfqq_idle_for_long_time(struct bfq_data *bfqd, + struct bfq_queue *bfqq) +{ + return bfqq->dispatched == 0 && + time_is_before_jiffies( + bfqq->budget_timeout + + bfqd->bfq_wr_min_idle_time); +} + +static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd, + struct bfq_queue *bfqq, + int old_wr_coeff, + struct request *rq, + bool *interactive) +{ + bool soft_rt, in_burst, wr_or_deserves_wr, + bfqq_wants_to_preempt, + idle_for_long_time = bfq_bfqq_idle_for_long_time(bfqd, bfqq), + /* + * See the comments on + * bfq_bfqq_update_budg_for_activation for + * details on the usage of the next variable. + */ + arrived_in_time = ktime_get_ns() <= + RQ_BIC(rq)->ttime.last_end_request + + bfqd->bfq_slice_idle * 3; + + bfq_log_bfqq(bfqd, bfqq, + "bfq_add_request non-busy: " + "jiffies %lu, in_time %d, idle_long %d busyw %d " + "wr_coeff %u", + jiffies, arrived_in_time, + idle_for_long_time, + bfq_bfqq_non_blocking_wait_rq(bfqq), + old_wr_coeff); + + BUG_ON(bfqq->entity.budget < bfqq->entity.service); + + BUG_ON(bfqq == bfqd->in_service_queue); + bfqg_stats_update_io_add(bfqq_group(RQ_BFQQ(rq)), bfqq, rq->cmd_flags); + + /* + * bfqq deserves to be weight-raised if: + * - it is sync, + * - it does not belong to a large burst, + * - it has been idle for enough time or is soft real-time, + * - is linked to a bfq_io_cq (it is not shared in any sense) + */ + in_burst = bfq_bfqq_in_large_burst(bfqq); + soft_rt = bfqd->bfq_wr_max_softrt_rate > 0 && + !in_burst && + time_is_before_jiffies(bfqq->soft_rt_next_start); + *interactive = + !in_burst && + idle_for_long_time; + wr_or_deserves_wr = bfqd->low_latency && + (bfqq->wr_coeff > 1 || + (bfq_bfqq_sync(bfqq) && + bfqq->bic && (*interactive || soft_rt))); + + bfq_log_bfqq(bfqd, bfqq, + "bfq_add_request: " + "in_burst %d, " + "soft_rt %d (next %lu), inter %d, bic %p", + bfq_bfqq_in_large_burst(bfqq), soft_rt, + bfqq->soft_rt_next_start, + *interactive, + bfqq->bic); + + /* + * Using the last flag, update budget and check whether bfqq + * may want to preempt the in-service queue. + */ + bfqq_wants_to_preempt = + bfq_bfqq_update_budg_for_activation(bfqd, bfqq, + arrived_in_time, + wr_or_deserves_wr); + + /* + * If bfqq happened to be activated in a burst, but has been + * idle for much more than an interactive queue, then we + * assume that, in the overall I/O initiated in the burst, the + * I/O associated with bfqq is finished. So bfqq does not need + * to be treated as a queue belonging to a burst + * anymore. Accordingly, we reset bfqq's in_large_burst flag + * if set, and remove bfqq from the burst list if it's + * there. We do not decrement burst_size, because the fact + * that bfqq does not need to belong to the burst list any + * more does not invalidate the fact that bfqq was created in + * a burst. + */ + if (likely(!bfq_bfqq_just_created(bfqq)) && + idle_for_long_time && + time_is_before_jiffies( + bfqq->budget_timeout + + msecs_to_jiffies(10000))) { + hlist_del_init(&bfqq->burst_list_node); + bfq_clear_bfqq_in_large_burst(bfqq); + } + + bfq_clear_bfqq_just_created(bfqq); + + if (!bfq_bfqq_IO_bound(bfqq)) { + if (arrived_in_time) { + bfqq->requests_within_timer++; + if (bfqq->requests_within_timer >= + bfqd->bfq_requests_within_timer) + bfq_mark_bfqq_IO_bound(bfqq); + } else + bfqq->requests_within_timer = 0; + bfq_log_bfqq(bfqd, bfqq, "requests in time %d", + bfqq->requests_within_timer); + } + + if (bfqd->low_latency) { + if (unlikely(time_is_after_jiffies(bfqq->split_time))) + /* wraparound */ + bfqq->split_time = + jiffies - bfqd->bfq_wr_min_idle_time - 1; + + if (time_is_before_jiffies(bfqq->split_time + + bfqd->bfq_wr_min_idle_time)) { + bfq_update_bfqq_wr_on_rq_arrival(bfqd, bfqq, + old_wr_coeff, + wr_or_deserves_wr, + *interactive, + in_burst, + soft_rt); + + if (old_wr_coeff != bfqq->wr_coeff) + bfqq->entity.prio_changed = 1; + } + } + + bfqq->last_idle_bklogged = jiffies; + bfqq->service_from_backlogged = 0; + bfq_clear_bfqq_softrt_update(bfqq); + + bfq_add_bfqq_busy(bfqd, bfqq); + + /* + * Expire in-service queue only if preemption may be needed + * for guarantees. In this respect, the function + * next_queue_may_preempt just checks a simple, necessary + * condition, and not a sufficient condition based on + * timestamps. In fact, for the latter condition to be + * evaluated, timestamps would need first to be updated, and + * this operation is quite costly (see the comments on the + * function bfq_bfqq_update_budg_for_activation). + */ + if (bfqd->in_service_queue && bfqq_wants_to_preempt && + bfqd->in_service_queue->wr_coeff < bfqq->wr_coeff && + next_queue_may_preempt(bfqd)) { + struct bfq_queue *in_serv = + bfqd->in_service_queue; + BUG_ON(in_serv == bfqq); + + bfq_bfqq_expire(bfqd, bfqd->in_service_queue, + false, BFQ_BFQQ_PREEMPTED); + } +} + +static void bfq_add_request(struct request *rq) +{ + struct bfq_queue *bfqq = RQ_BFQQ(rq); + struct bfq_data *bfqd = bfqq->bfqd; + struct request *next_rq, *prev; + unsigned int old_wr_coeff = bfqq->wr_coeff; + bool interactive = false; + + bfq_log_bfqq(bfqd, bfqq, "add_request: size %u %s", + blk_rq_sectors(rq), rq_is_sync(rq) ? "S" : "A"); + + if (bfqq->wr_coeff > 1) /* queue is being weight-raised */ + bfq_log_bfqq(bfqd, bfqq, + "raising period dur %u/%u msec, old coeff %u, w %d(%d)", + jiffies_to_msecs(jiffies - bfqq->last_wr_start_finish), + jiffies_to_msecs(bfqq->wr_cur_max_time), + bfqq->wr_coeff, + bfqq->entity.weight, bfqq->entity.orig_weight); + + bfqq->queued[rq_is_sync(rq)]++; + bfqd->queued++; + + elv_rb_add(&bfqq->sort_list, rq); + + /* + * Check if this request is a better next-to-serve candidate. + */ + prev = bfqq->next_rq; + next_rq = bfq_choose_req(bfqd, bfqq->next_rq, rq, bfqd->last_position); + BUG_ON(!next_rq); + bfqq->next_rq = next_rq; + + /* + * Adjust priority tree position, if next_rq changes. + */ + if (prev != bfqq->next_rq) + bfq_pos_tree_add_move(bfqd, bfqq); + + if (!bfq_bfqq_busy(bfqq)) /* switching to busy ... */ + bfq_bfqq_handle_idle_busy_switch(bfqd, bfqq, old_wr_coeff, + rq, &interactive); + else { + if (bfqd->low_latency && old_wr_coeff == 1 && !rq_is_sync(rq) && + time_is_before_jiffies( + bfqq->last_wr_start_finish + + bfqd->bfq_wr_min_inter_arr_async)) { + bfqq->wr_coeff = bfqd->bfq_wr_coeff; + bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); + + bfqd->wr_busy_queues++; + BUG_ON(bfqd->wr_busy_queues > bfqd->busy_queues); + bfqq->entity.prio_changed = 1; + bfq_log_bfqq(bfqd, bfqq, + "non-idle wrais starting, " + "wr_max_time %u wr_busy %d", + jiffies_to_msecs(bfqq->wr_cur_max_time), + bfqd->wr_busy_queues); + } + if (prev != bfqq->next_rq) + bfq_updated_next_req(bfqd, bfqq); + } + + /* + * Assign jiffies to last_wr_start_finish in the following + * cases: + * + * . if bfqq is not going to be weight-raised, because, for + * non weight-raised queues, last_wr_start_finish stores the + * arrival time of the last request; as of now, this piece + * of information is used only for deciding whether to + * weight-raise async queues + * + * . if bfqq is not weight-raised, because, if bfqq is now + * switching to weight-raised, then last_wr_start_finish + * stores the time when weight-raising starts + * + * . if bfqq is interactive, because, regardless of whether + * bfqq is currently weight-raised, the weight-raising + * period must start or restart (this case is considered + * separately because it is not detected by the above + * conditions, if bfqq is already weight-raised) + * + * last_wr_start_finish has to be updated also if bfqq is soft + * real-time, because the weight-raising period is constantly + * restarted on idle-to-busy transitions for these queues, but + * this is already done in bfq_bfqq_handle_idle_busy_switch if + * needed. + */ + if (bfqd->low_latency && + (old_wr_coeff == 1 || bfqq->wr_coeff == 1 || interactive)) + bfqq->last_wr_start_finish = jiffies; +} + +static struct request *bfq_find_rq_fmerge(struct bfq_data *bfqd, + struct bio *bio) +{ + struct task_struct *tsk = current; + struct bfq_io_cq *bic; + struct bfq_queue *bfqq; + + bic = bfq_bic_lookup(bfqd, tsk->io_context); + if (!bic) + return NULL; + + bfqq = bic_to_bfqq(bic, op_is_sync(bio->bi_opf)); + if (bfqq) + return elv_rb_find(&bfqq->sort_list, bio_end_sector(bio)); + + return NULL; +} + +static sector_t get_sdist(sector_t last_pos, struct request *rq) +{ + sector_t sdist = 0; + + if (last_pos) { + if (last_pos < blk_rq_pos(rq)) + sdist = blk_rq_pos(rq) - last_pos; + else + sdist = last_pos - blk_rq_pos(rq); + } + + return sdist; +} + +static void bfq_activate_request(struct request_queue *q, struct request *rq) +{ + struct bfq_data *bfqd = q->elevator->elevator_data; + bfqd->rq_in_driver++; +} + +static void bfq_deactivate_request(struct request_queue *q, struct request *rq) +{ + struct bfq_data *bfqd = q->elevator->elevator_data; + + BUG_ON(bfqd->rq_in_driver == 0); + bfqd->rq_in_driver--; +} + +static void bfq_remove_request(struct request *rq) +{ + struct bfq_queue *bfqq = RQ_BFQQ(rq); + struct bfq_data *bfqd = bfqq->bfqd; + const int sync = rq_is_sync(rq); + + BUG_ON(bfqq->entity.service > bfqq->entity.budget && + bfqq == bfqd->in_service_queue); + + if (bfqq->next_rq == rq) { + bfqq->next_rq = bfq_find_next_rq(bfqd, bfqq, rq); + bfq_updated_next_req(bfqd, bfqq); + } + + if (rq->queuelist.prev != &rq->queuelist) + list_del_init(&rq->queuelist); + BUG_ON(bfqq->queued[sync] == 0); + bfqq->queued[sync]--; + bfqd->queued--; + elv_rb_del(&bfqq->sort_list, rq); + + if (RB_EMPTY_ROOT(&bfqq->sort_list)) { + bfqq->next_rq = NULL; + + BUG_ON(bfqq->entity.budget < 0); + + if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->in_service_queue) { + BUG_ON(bfqq->ref < 2); /* referred by rq and on tree */ + bfq_del_bfqq_busy(bfqd, bfqq, false); + /* + * bfqq emptied. In normal operation, when + * bfqq is empty, bfqq->entity.service and + * bfqq->entity.budget must contain, + * respectively, the service received and the + * budget used last time bfqq emptied. These + * facts do not hold in this case, as at least + * this last removal occurred while bfqq is + * not in service. To avoid inconsistencies, + * reset both bfqq->entity.service and + * bfqq->entity.budget, if bfqq has still a + * process that may issue I/O requests to it. + */ + bfqq->entity.budget = bfqq->entity.service = 0; + } + + /* + * Remove queue from request-position tree as it is empty. + */ + if (bfqq->pos_root) { + rb_erase(&bfqq->pos_node, bfqq->pos_root); + bfqq->pos_root = NULL; + } + } + + if (rq->cmd_flags & REQ_META) { + BUG_ON(bfqq->meta_pending == 0); + bfqq->meta_pending--; + } + bfqg_stats_update_io_remove(bfqq_group(bfqq), rq->cmd_flags); +} + +static enum elv_merge bfq_merge(struct request_queue *q, struct request **req, + struct bio *bio) +{ + struct bfq_data *bfqd = q->elevator->elevator_data; + struct request *__rq; + + __rq = bfq_find_rq_fmerge(bfqd, bio); + if (__rq && elv_bio_merge_ok(__rq, bio)) { + *req = __rq; + return ELEVATOR_FRONT_MERGE; + } + + return ELEVATOR_NO_MERGE; +} + +static void bfq_merged_request(struct request_queue *q, struct request *req, + enum elv_merge type) +{ + if (type == ELEVATOR_FRONT_MERGE && + rb_prev(&req->rb_node) && + blk_rq_pos(req) < + blk_rq_pos(container_of(rb_prev(&req->rb_node), + struct request, rb_node))) { + struct bfq_queue *bfqq = RQ_BFQQ(req); + struct bfq_data *bfqd = bfqq->bfqd; + struct request *prev, *next_rq; + + /* Reposition request in its sort_list */ + elv_rb_del(&bfqq->sort_list, req); + elv_rb_add(&bfqq->sort_list, req); + /* Choose next request to be served for bfqq */ + prev = bfqq->next_rq; + next_rq = bfq_choose_req(bfqd, bfqq->next_rq, req, + bfqd->last_position); + BUG_ON(!next_rq); + bfqq->next_rq = next_rq; + /* + * If next_rq changes, update both the queue's budget to + * fit the new request and the queue's position in its + * rq_pos_tree. + */ + if (prev != bfqq->next_rq) { + bfq_updated_next_req(bfqd, bfqq); + bfq_pos_tree_add_move(bfqd, bfqq); + } + } +} + +#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED +static void bfq_bio_merged(struct request_queue *q, struct request *req, + struct bio *bio) +{ + bfqg_stats_update_io_merged(bfqq_group(RQ_BFQQ(req)), bio->bi_opf); +} +#endif + +static void bfq_merged_requests(struct request_queue *q, struct request *rq, + struct request *next) +{ + struct bfq_queue *bfqq = RQ_BFQQ(rq), *next_bfqq = RQ_BFQQ(next); + + /* + * If next and rq belong to the same bfq_queue and next is older + * than rq, then reposition rq in the fifo (by substituting next + * with rq). Otherwise, if next and rq belong to different + * bfq_queues, never reposition rq: in fact, we would have to + * reposition it with respect to next's position in its own fifo, + * which would most certainly be too expensive with respect to + * the benefits. + */ + if (bfqq == next_bfqq && + !list_empty(&rq->queuelist) && !list_empty(&next->queuelist) && + next->fifo_time < rq->fifo_time) { + list_del_init(&rq->queuelist); + list_replace_init(&next->queuelist, &rq->queuelist); + rq->fifo_time = next->fifo_time; + } + + if (bfqq->next_rq == next) + bfqq->next_rq = rq; + + bfq_remove_request(next); + bfqg_stats_update_io_merged(bfqq_group(bfqq), next->cmd_flags); +} + +/* Must be called with bfqq != NULL */ +static void bfq_bfqq_end_wr(struct bfq_queue *bfqq) +{ + BUG_ON(!bfqq); + + if (bfq_bfqq_busy(bfqq)) { + bfqq->bfqd->wr_busy_queues--; + BUG_ON(bfqq->bfqd->wr_busy_queues < 0); + } + bfqq->wr_coeff = 1; + bfqq->wr_cur_max_time = 0; + bfqq->last_wr_start_finish = jiffies; + /* + * Trigger a weight change on the next invocation of + * __bfq_entity_update_weight_prio. + */ + bfqq->entity.prio_changed = 1; + bfq_log_bfqq(bfqq->bfqd, bfqq, + "end_wr: wrais ending at %lu, rais_max_time %u", + bfqq->last_wr_start_finish, + jiffies_to_msecs(bfqq->wr_cur_max_time)); + bfq_log_bfqq(bfqq->bfqd, bfqq, "end_wr: wr_busy %d", + bfqq->bfqd->wr_busy_queues); +} + +static void bfq_end_wr_async_queues(struct bfq_data *bfqd, + struct bfq_group *bfqg) +{ + int i, j; + + for (i = 0; i < 2; i++) + for (j = 0; j < IOPRIO_BE_NR; j++) + if (bfqg->async_bfqq[i][j]) + bfq_bfqq_end_wr(bfqg->async_bfqq[i][j]); + if (bfqg->async_idle_bfqq) + bfq_bfqq_end_wr(bfqg->async_idle_bfqq); +} + +static void bfq_end_wr(struct bfq_data *bfqd) +{ + struct bfq_queue *bfqq; + + spin_lock_irq(bfqd->queue->queue_lock); + + list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) + bfq_bfqq_end_wr(bfqq); + list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list) + bfq_bfqq_end_wr(bfqq); + bfq_end_wr_async(bfqd); + + spin_unlock_irq(bfqd->queue->queue_lock); +} + +static sector_t bfq_io_struct_pos(void *io_struct, bool request) +{ + if (request) + return blk_rq_pos(io_struct); + else + return ((struct bio *)io_struct)->bi_iter.bi_sector; +} + +static int bfq_rq_close_to_sector(void *io_struct, bool request, + sector_t sector) +{ + return abs(bfq_io_struct_pos(io_struct, request) - sector) <= + BFQQ_CLOSE_THR; +} + +static struct bfq_queue *bfqq_find_close(struct bfq_data *bfqd, + struct bfq_queue *bfqq, + sector_t sector) +{ + struct rb_root *root = &bfq_bfqq_to_bfqg(bfqq)->rq_pos_tree; + struct rb_node *parent, *node; + struct bfq_queue *__bfqq; + + if (RB_EMPTY_ROOT(root)) + return NULL; + + /* + * First, if we find a request starting at the end of the last + * request, choose it. + */ + __bfqq = bfq_rq_pos_tree_lookup(bfqd, root, sector, &parent, NULL); + if (__bfqq) + return __bfqq; + + /* + * If the exact sector wasn't found, the parent of the NULL leaf + * will contain the closest sector (rq_pos_tree sorted by + * next_request position). + */ + __bfqq = rb_entry(parent, struct bfq_queue, pos_node); + if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector)) + return __bfqq; + + if (blk_rq_pos(__bfqq->next_rq) < sector) + node = rb_next(&__bfqq->pos_node); + else + node = rb_prev(&__bfqq->pos_node); + if (!node) + return NULL; + + __bfqq = rb_entry(node, struct bfq_queue, pos_node); + if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector)) + return __bfqq; + + return NULL; +} + +static struct bfq_queue *bfq_find_close_cooperator(struct bfq_data *bfqd, + struct bfq_queue *cur_bfqq, + sector_t sector) +{ + struct bfq_queue *bfqq; + + /* + * We shall notice if some of the queues are cooperating, + * e.g., working closely on the same area of the device. In + * that case, we can group them together and: 1) don't waste + * time idling, and 2) serve the union of their requests in + * the best possible order for throughput. + */ + bfqq = bfqq_find_close(bfqd, cur_bfqq, sector); + if (!bfqq || bfqq == cur_bfqq) + return NULL; + + return bfqq; +} + +static struct bfq_queue * +bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) +{ + int process_refs, new_process_refs; + struct bfq_queue *__bfqq; + + /* + * If there are no process references on the new_bfqq, then it is + * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain + * may have dropped their last reference (not just their last process + * reference). + */ + if (!bfqq_process_refs(new_bfqq)) + return NULL; + + /* Avoid a circular list and skip interim queue merges. */ + while ((__bfqq = new_bfqq->new_bfqq)) { + if (__bfqq == bfqq) + return NULL; + new_bfqq = __bfqq; + } + + process_refs = bfqq_process_refs(bfqq); + new_process_refs = bfqq_process_refs(new_bfqq); + /* + * If the process for the bfqq has gone away, there is no + * sense in merging the queues. + */ + if (process_refs == 0 || new_process_refs == 0) + return NULL; + + bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d", + new_bfqq->pid); + + /* + * Merging is just a redirection: the requests of the process + * owning one of the two queues are redirected to the other queue. + * The latter queue, in its turn, is set as shared if this is the + * first time that the requests of some process are redirected to + * it. + * + * We redirect bfqq to new_bfqq and not the opposite, because we + * are in the context of the process owning bfqq, hence we have + * the io_cq of this process. So we can immediately configure this + * io_cq to redirect the requests of the process to new_bfqq. + * + * NOTE, even if new_bfqq coincides with the in-service queue, the + * io_cq of new_bfqq is not available, because, if the in-service + * queue is shared, bfqd->in_service_bic may not point to the + * io_cq of the in-service queue. + * Redirecting the requests of the process owning bfqq to the + * currently in-service queue is in any case the best option, as + * we feed the in-service queue with new requests close to the + * last request served and, by doing so, hopefully increase the + * throughput. + */ + bfqq->new_bfqq = new_bfqq; + new_bfqq->ref += process_refs; + return new_bfqq; +} + +static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq, + struct bfq_queue *new_bfqq) +{ + if (bfq_class_idle(bfqq) || bfq_class_idle(new_bfqq) || + (bfqq->ioprio_class != new_bfqq->ioprio_class)) + return false; + + /* + * If either of the queues has already been detected as seeky, + * then merging it with the other queue is unlikely to lead to + * sequential I/O. + */ + if (BFQQ_SEEKY(bfqq) || BFQQ_SEEKY(new_bfqq)) + return false; + + /* + * Interleaved I/O is known to be done by (some) applications + * only for reads, so it does not make sense to merge async + * queues. + */ + if (!bfq_bfqq_sync(bfqq) || !bfq_bfqq_sync(new_bfqq)) + return false; + + return true; +} + +/* + * If this function returns true, then bfqq cannot be merged. The idea + * is that true cooperation happens very early after processes start + * to do I/O. Usually, late cooperations are just accidental false + * positives. In case bfqq is weight-raised, such false positives + * would evidently degrade latency guarantees for bfqq. + */ +static bool wr_from_too_long(struct bfq_queue *bfqq) +{ + return bfqq->wr_coeff > 1 && + time_is_before_jiffies(bfqq->last_wr_start_finish + + msecs_to_jiffies(100)); +} + +/* + * Attempt to schedule a merge of bfqq with the currently in-service + * queue or with a close queue among the scheduled queues. Return + * NULL if no merge was scheduled, a pointer to the shared bfq_queue + * structure otherwise. + * + * The OOM queue is not allowed to participate to cooperation: in fact, since + * the requests temporarily redirected to the OOM queue could be redirected + * again to dedicated queues at any time, the state needed to correctly + * handle merging with the OOM queue would be quite complex and expensive + * to maintain. Besides, in such a critical condition as an out of memory, + * the benefits of queue merging may be little relevant, or even negligible. + * + * Weight-raised queues can be merged only if their weight-raising + * period has just started. In fact cooperating processes are usually + * started together. Thus, with this filter we avoid false positives + * that would jeopardize low-latency guarantees. + * + * WARNING: queue merging may impair fairness among non-weight raised + * queues, for at least two reasons: 1) the original weight of a + * merged queue may change during the merged state, 2) even being the + * weight the same, a merged queue may be bloated with many more + * requests than the ones produced by its originally-associated + * process. + */ +static struct bfq_queue * +bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, + void *io_struct, bool request) +{ + struct bfq_queue *in_service_bfqq, *new_bfqq; + + if (bfqq->new_bfqq) + return bfqq->new_bfqq; + + if (io_struct && wr_from_too_long(bfqq) && + likely(bfqq != &bfqd->oom_bfqq)) + bfq_log_bfqq(bfqd, bfqq, + "would have looked for coop, but bfq%d wr", + bfqq->pid); + + if (!io_struct || + wr_from_too_long(bfqq) || + unlikely(bfqq == &bfqd->oom_bfqq)) + return NULL; + + /* If there is only one backlogged queue, don't search. */ + if (bfqd->busy_queues == 1) + return NULL; + + in_service_bfqq = bfqd->in_service_queue; + + if (in_service_bfqq && in_service_bfqq != bfqq && + bfqd->in_service_bic && wr_from_too_long(in_service_bfqq) + && likely(in_service_bfqq == &bfqd->oom_bfqq)) + bfq_log_bfqq(bfqd, bfqq, + "would have tried merge with in-service-queue, but wr"); + + if (!in_service_bfqq || in_service_bfqq == bfqq || + !bfqd->in_service_bic || wr_from_too_long(in_service_bfqq) || + unlikely(in_service_bfqq == &bfqd->oom_bfqq)) + goto check_scheduled; + + if (bfq_rq_close_to_sector(io_struct, request, bfqd->last_position) && + bfqq->entity.parent == in_service_bfqq->entity.parent && + bfq_may_be_close_cooperator(bfqq, in_service_bfqq)) { + new_bfqq = bfq_setup_merge(bfqq, in_service_bfqq); + if (new_bfqq) + return new_bfqq; + } + /* + * Check whether there is a cooperator among currently scheduled + * queues. The only thing we need is that the bio/request is not + * NULL, as we need it to establish whether a cooperator exists. + */ +check_scheduled: + new_bfqq = bfq_find_close_cooperator(bfqd, bfqq, + bfq_io_struct_pos(io_struct, request)); + + BUG_ON(new_bfqq && bfqq->entity.parent != new_bfqq->entity.parent); + + if (new_bfqq && wr_from_too_long(new_bfqq) && + likely(new_bfqq != &bfqd->oom_bfqq) && + bfq_may_be_close_cooperator(bfqq, new_bfqq)) + bfq_log_bfqq(bfqd, bfqq, + "would have merged with bfq%d, but wr", + new_bfqq->pid); + + if (new_bfqq && !wr_from_too_long(new_bfqq) && + likely(new_bfqq != &bfqd->oom_bfqq) && + bfq_may_be_close_cooperator(bfqq, new_bfqq)) + return bfq_setup_merge(bfqq, new_bfqq); + + return NULL; +} + +static void bfq_bfqq_save_state(struct bfq_queue *bfqq) +{ + struct bfq_io_cq *bic = bfqq->bic; + + /* + * If !bfqq->bic, the queue is already shared or its requests + * have already been redirected to a shared queue; both idle window + * and weight raising state have already been saved. Do nothing. + */ + if (!bic) + return; + + bic->saved_has_short_ttime = bfq_bfqq_has_short_ttime(bfqq); + bic->saved_IO_bound = bfq_bfqq_IO_bound(bfqq); + bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq); + bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node); + bic->saved_wr_coeff = bfqq->wr_coeff; + bic->saved_wr_start_at_switch_to_srt = bfqq->wr_start_at_switch_to_srt; + bic->saved_last_wr_start_finish = bfqq->last_wr_start_finish; + bic->saved_wr_cur_max_time = bfqq->wr_cur_max_time; + BUG_ON(time_is_after_jiffies(bfqq->last_wr_start_finish)); +} + +static void bfq_get_bic_reference(struct bfq_queue *bfqq) +{ + /* + * If bfqq->bic has a non-NULL value, the bic to which it belongs + * is about to begin using a shared bfq_queue. + */ + if (bfqq->bic) + atomic_long_inc(&bfqq->bic->icq.ioc->refcount); +} + +static void +bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic, + struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) +{ + bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu", + (unsigned long) new_bfqq->pid); + /* Save weight raising and idle window of the merged queues */ + bfq_bfqq_save_state(bfqq); + bfq_bfqq_save_state(new_bfqq); + if (bfq_bfqq_IO_bound(bfqq)) + bfq_mark_bfqq_IO_bound(new_bfqq); + bfq_clear_bfqq_IO_bound(bfqq); + + /* + * If bfqq is weight-raised, then let new_bfqq inherit + * weight-raising. To reduce false positives, neglect the case + * where bfqq has just been created, but has not yet made it + * to be weight-raised (which may happen because EQM may merge + * bfqq even before bfq_add_request is executed for the first + * time for bfqq). Handling this case would however be very + * easy, thanks to the flag just_created. + */ + if (new_bfqq->wr_coeff == 1 && bfqq->wr_coeff > 1) { + new_bfqq->wr_coeff = bfqq->wr_coeff; + new_bfqq->wr_cur_max_time = bfqq->wr_cur_max_time; + new_bfqq->last_wr_start_finish = bfqq->last_wr_start_finish; + new_bfqq->wr_start_at_switch_to_srt = + bfqq->wr_start_at_switch_to_srt; + if (bfq_bfqq_busy(new_bfqq)) { + bfqd->wr_busy_queues++; + BUG_ON(bfqd->wr_busy_queues > bfqd->busy_queues); + } + + new_bfqq->entity.prio_changed = 1; + bfq_log_bfqq(bfqd, new_bfqq, + "wr start after merge with %d, rais_max_time %u", + bfqq->pid, + jiffies_to_msecs(bfqq->wr_cur_max_time)); + } + + if (bfqq->wr_coeff > 1) { /* bfqq has given its wr to new_bfqq */ + bfqq->wr_coeff = 1; + bfqq->entity.prio_changed = 1; + if (bfq_bfqq_busy(bfqq)) { + bfqd->wr_busy_queues--; + BUG_ON(bfqd->wr_busy_queues < 0); + } + + } + + bfq_log_bfqq(bfqd, new_bfqq, "merge_bfqqs: wr_busy %d", + bfqd->wr_busy_queues); + + /* + * Grab a reference to the bic, to prevent it from being destroyed + * before being possibly touched by a bfq_split_bfqq(). + */ + bfq_get_bic_reference(bfqq); + bfq_get_bic_reference(new_bfqq); + /* + * Merge queues (that is, let bic redirect its requests to new_bfqq) + */ + bic_set_bfqq(bic, new_bfqq, 1); + bfq_mark_bfqq_coop(new_bfqq); + /* + * new_bfqq now belongs to at least two bics (it is a shared queue): + * set new_bfqq->bic to NULL. bfqq either: + * - does not belong to any bic any more, and hence bfqq->bic must + * be set to NULL, or + * - is a queue whose owning bics have already been redirected to a + * different queue, hence the queue is destined to not belong to + * any bic soon and bfqq->bic is already NULL (therefore the next + * assignment causes no harm). + */ + new_bfqq->bic = NULL; + bfqq->bic = NULL; + /* release process reference to bfqq */ + bfq_put_queue(bfqq); +} + +static int bfq_allow_bio_merge(struct request_queue *q, struct request *rq, + struct bio *bio) +{ + struct bfq_data *bfqd = q->elevator->elevator_data; + bool is_sync = op_is_sync(bio->bi_opf); + struct bfq_io_cq *bic; + struct bfq_queue *bfqq, *new_bfqq; + + /* + * Disallow merge of a sync bio into an async request. + */ + if (is_sync && !rq_is_sync(rq)) + return false; + + /* + * Lookup the bfqq that this bio will be queued with. Allow + * merge only if rq is queued there. + * Queue lock is held here. + */ + bic = bfq_bic_lookup(bfqd, current->io_context); + if (!bic) + return false; + + bfqq = bic_to_bfqq(bic, is_sync); + /* + * We take advantage of this function to perform an early merge + * of the queues of possible cooperating processes. + */ + if (bfqq) { + new_bfqq = bfq_setup_cooperator(bfqd, bfqq, bio, false); + if (new_bfqq) { + bfq_merge_bfqqs(bfqd, bic, bfqq, new_bfqq); + /* + * If we get here, the bio will be queued in the + * shared queue, i.e., new_bfqq, so use new_bfqq + * to decide whether bio and rq can be merged. + */ + bfqq = new_bfqq; + } + } + + return bfqq == RQ_BFQQ(rq); +} + +static int bfq_allow_rq_merge(struct request_queue *q, struct request *rq, + struct request *next) +{ + return RQ_BFQQ(rq) == RQ_BFQQ(next); +} + +/* + * Set the maximum time for the in-service queue to consume its + * budget. This prevents seeky processes from lowering the throughput. + * In practice, a time-slice service scheme is used with seeky + * processes. + */ +static void bfq_set_budget_timeout(struct bfq_data *bfqd, + struct bfq_queue *bfqq) +{ + unsigned int timeout_coeff; + + if (bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time) + timeout_coeff = 1; + else + timeout_coeff = bfqq->entity.weight / bfqq->entity.orig_weight; + + bfqd->last_budget_start = ktime_get(); + + bfqq->budget_timeout = jiffies + + bfqd->bfq_timeout * timeout_coeff; + + bfq_log_bfqq(bfqd, bfqq, "set budget_timeout %u", + jiffies_to_msecs(bfqd->bfq_timeout * timeout_coeff)); +} + +static void __bfq_set_in_service_queue(struct bfq_data *bfqd, + struct bfq_queue *bfqq) +{ + if (bfqq) { + bfqg_stats_update_avg_queue_size(bfqq_group(bfqq)); + bfq_mark_bfqq_must_alloc(bfqq); + bfq_clear_bfqq_fifo_expire(bfqq); + + bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8; + + BUG_ON(bfqq == bfqd->in_service_queue); + BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list)); + + if (time_is_before_jiffies(bfqq->last_wr_start_finish) && + bfqq->wr_coeff > 1 && + bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time && + time_is_before_jiffies(bfqq->budget_timeout)) { + /* + * For soft real-time queues, move the start + * of the weight-raising period forward by the + * time the queue has not received any + * service. Otherwise, a relatively long + * service delay is likely to cause the + * weight-raising period of the queue to end, + * because of the short duration of the + * weight-raising period of a soft real-time + * queue. It is worth noting that this move + * is not so dangerous for the other queues, + * because soft real-time queues are not + * greedy. + * + * To not add a further variable, we use the + * overloaded field budget_timeout to + * determine for how long the queue has not + * received service, i.e., how much time has + * elapsed since the queue expired. However, + * this is a little imprecise, because + * budget_timeout is set to jiffies if bfqq + * not only expires, but also remains with no + * request. + */ + if (time_after(bfqq->budget_timeout, + bfqq->last_wr_start_finish)) + bfqq->last_wr_start_finish += + jiffies - bfqq->budget_timeout; + else + bfqq->last_wr_start_finish = jiffies; + + if (time_is_after_jiffies(bfqq->last_wr_start_finish)) { + pr_crit( + "BFQ WARNING:last %lu budget %lu jiffies %lu", + bfqq->last_wr_start_finish, + bfqq->budget_timeout, + jiffies); + pr_crit("diff %lu", jiffies - + max_t(unsigned long, + bfqq->last_wr_start_finish, + bfqq->budget_timeout)); + bfqq->last_wr_start_finish = jiffies; + } + } + + bfq_set_budget_timeout(bfqd, bfqq); + bfq_log_bfqq(bfqd, bfqq, + "set_in_service_queue, cur-budget = %d", + bfqq->entity.budget); + } else + bfq_log(bfqd, "set_in_service_queue: NULL"); + + bfqd->in_service_queue = bfqq; +} + +/* + * Get and set a new queue for service. + */ +static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd) +{ + struct bfq_queue *bfqq = bfq_get_next_queue(bfqd); + + __bfq_set_in_service_queue(bfqd, bfqq); + return bfqq; +} + +static void bfq_arm_slice_timer(struct bfq_data *bfqd) +{ + struct bfq_queue *bfqq = bfqd->in_service_queue; + struct bfq_io_cq *bic; + u32 sl; + + BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list)); + + /* Processes have exited, don't wait. */ + bic = bfqd->in_service_bic; + if (!bic || atomic_read(&bic->icq.ioc->active_ref) == 0) + return; + + bfq_mark_bfqq_wait_request(bfqq); + + /* + * We don't want to idle for seeks, but we do want to allow + * fair distribution of slice time for a process doing back-to-back + * seeks. So allow a little bit of time for him to submit a new rq. + * + * To prevent processes with (partly) seeky workloads from + * being too ill-treated, grant them a small fraction of the + * assigned budget before reducing the waiting time to + * BFQ_MIN_TT. This happened to help reduce latency. + */ + sl = bfqd->bfq_slice_idle; + /* + * Unless the queue is being weight-raised or the scenario is + * asymmetric, grant only minimum idle time if the queue + * is seeky. A long idling is preserved for a weight-raised + * queue, or, more in general, in an asymemtric scenario, + * because a long idling is needed for guaranteeing to a queue + * its reserved share of the throughput (in particular, it is + * needed if the queue has a higher weight than some other + * queue). + */ + if (BFQQ_SEEKY(bfqq) && bfqq->wr_coeff == 1 && + bfq_symmetric_scenario(bfqd)) + sl = min_t(u32, sl, BFQ_MIN_TT); + + bfqd->last_idling_start = ktime_get(); + hrtimer_start(&bfqd->idle_slice_timer, ns_to_ktime(sl), + HRTIMER_MODE_REL); + bfqg_stats_set_start_idle_time(bfqq_group(bfqq)); + bfq_log(bfqd, "arm idle: %ld/%ld ms", + sl / NSEC_PER_MSEC, bfqd->bfq_slice_idle / NSEC_PER_MSEC); +} + +/* + * In autotuning mode, max_budget is dynamically recomputed as the + * amount of sectors transferred in timeout at the estimated peak + * rate. This enables BFQ to utilize a full timeslice with a full + * budget, even if the in-service queue is served at peak rate. And + * this maximises throughput with sequential workloads. + */ +static unsigned long bfq_calc_max_budget(struct bfq_data *bfqd) +{ + return (u64)bfqd->peak_rate * USEC_PER_MSEC * + jiffies_to_msecs(bfqd->bfq_timeout)>>BFQ_RATE_SHIFT; +} + +/* + * Update parameters related to throughput and responsiveness, as a + * function of the estimated peak rate. See comments on + * bfq_calc_max_budget(), and on T_slow and T_fast arrays. + */ +static void update_thr_responsiveness_params(struct bfq_data *bfqd) +{ + int dev_type = blk_queue_nonrot(bfqd->queue); + + if (bfqd->bfq_user_max_budget == 0) { + bfqd->bfq_max_budget = + bfq_calc_max_budget(bfqd); + BUG_ON(bfqd->bfq_max_budget < 0); + bfq_log(bfqd, "new max_budget = %d", + bfqd->bfq_max_budget); + } + + if (bfqd->device_speed == BFQ_BFQD_FAST && + bfqd->peak_rate < device_speed_thresh[dev_type]) { + bfqd->device_speed = BFQ_BFQD_SLOW; + bfqd->RT_prod = R_slow[dev_type] * + T_slow[dev_type]; + } else if (bfqd->device_speed == BFQ_BFQD_SLOW && + bfqd->peak_rate > device_speed_thresh[dev_type]) { + bfqd->device_speed = BFQ_BFQD_FAST; + bfqd->RT_prod = R_fast[dev_type] * + T_fast[dev_type]; + } + + bfq_log(bfqd, +"dev_type %s dev_speed_class = %s (%llu sects/sec), thresh %llu setcs/sec", + dev_type == 0 ? "ROT" : "NONROT", + bfqd->device_speed == BFQ_BFQD_FAST ? "FAST" : "SLOW", + bfqd->device_speed == BFQ_BFQD_FAST ? + (USEC_PER_SEC*(u64)R_fast[dev_type])>>BFQ_RATE_SHIFT : + (USEC_PER_SEC*(u64)R_slow[dev_type])>>BFQ_RATE_SHIFT, + (USEC_PER_SEC*(u64)device_speed_thresh[dev_type])>> + BFQ_RATE_SHIFT); +} + +static void bfq_reset_rate_computation(struct bfq_data *bfqd, struct request *rq) +{ + if (rq != NULL) { /* new rq dispatch now, reset accordingly */ + bfqd->last_dispatch = bfqd->first_dispatch = ktime_get_ns() ; + bfqd->peak_rate_samples = 1; + bfqd->sequential_samples = 0; + bfqd->tot_sectors_dispatched = bfqd->last_rq_max_size = + blk_rq_sectors(rq); + } else /* no new rq dispatched, just reset the number of samples */ + bfqd->peak_rate_samples = 0; /* full re-init on next disp. */ + + bfq_log(bfqd, + "reset_rate_computation at end, sample %u/%u tot_sects %llu", + bfqd->peak_rate_samples, bfqd->sequential_samples, + bfqd->tot_sectors_dispatched); +} + +static void bfq_update_rate_reset(struct bfq_data *bfqd, struct request *rq) +{ + u32 rate, weight, divisor; + + /* + * For the convergence property to hold (see comments on + * bfq_update_peak_rate()) and for the assessment to be + * reliable, a minimum number of samples must be present, and + * a minimum amount of time must have elapsed. If not so, do + * not compute new rate. Just reset parameters, to get ready + * for a new evaluation attempt. + */ + if (bfqd->peak_rate_samples < BFQ_RATE_MIN_SAMPLES || + bfqd->delta_from_first < BFQ_RATE_MIN_INTERVAL) { + bfq_log(bfqd, + "update_rate_reset: only resetting, delta_first %lluus samples %d", + bfqd->delta_from_first>>10, bfqd->peak_rate_samples); + goto reset_computation; + } + + /* + * If a new request completion has occurred after last + * dispatch, then, to approximate the rate at which requests + * have been served by the device, it is more precise to + * extend the observation interval to the last completion. + */ + bfqd->delta_from_first = + max_t(u64, bfqd->delta_from_first, + bfqd->last_completion - bfqd->first_dispatch); + + BUG_ON(bfqd->delta_from_first == 0); + /* + * Rate computed in sects/usec, and not sects/nsec, for + * precision issues. + */ + rate = div64_ul(bfqd->tot_sectors_dispatched<delta_from_first, NSEC_PER_USEC)); + + bfq_log(bfqd, +"update_rate_reset: tot_sects %llu delta_first %lluus rate %llu sects/s (%d)", + bfqd->tot_sectors_dispatched, bfqd->delta_from_first>>10, + ((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT), + rate > 20< 20M sectors/sec) + */ + if ((bfqd->sequential_samples < (3 * bfqd->peak_rate_samples)>>2 && + rate <= bfqd->peak_rate) || + rate > 20<peak_rate_samples, bfqd->sequential_samples, + ((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT), + ((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT)); + goto reset_computation; + } else { + bfq_log(bfqd, + "update_rate_reset: do update, samples %u/%u rate/peak %llu/%llu", + bfqd->peak_rate_samples, bfqd->sequential_samples, + ((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT), + ((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT)); + } + + /* + * We have to update the peak rate, at last! To this purpose, + * we use a low-pass filter. We compute the smoothing constant + * of the filter as a function of the 'weight' of the new + * measured rate. + * + * As can be seen in next formulas, we define this weight as a + * quantity proportional to how sequential the workload is, + * and to how long the observation time interval is. + * + * The weight runs from 0 to 8. The maximum value of the + * weight, 8, yields the minimum value for the smoothing + * constant. At this minimum value for the smoothing constant, + * the measured rate contributes for half of the next value of + * the estimated peak rate. + * + * So, the first step is to compute the weight as a function + * of how sequential the workload is. Note that the weight + * cannot reach 9, because bfqd->sequential_samples cannot + * become equal to bfqd->peak_rate_samples, which, in its + * turn, holds true because bfqd->sequential_samples is not + * incremented for the first sample. + */ + weight = (9 * bfqd->sequential_samples) / bfqd->peak_rate_samples; + + /* + * Second step: further refine the weight as a function of the + * duration of the observation interval. + */ + weight = min_t(u32, 8, + div_u64(weight * bfqd->delta_from_first, + BFQ_RATE_REF_INTERVAL)); + + /* + * Divisor ranging from 10, for minimum weight, to 2, for + * maximum weight. + */ + divisor = 10 - weight; + BUG_ON(divisor == 0); + + /* + * Finally, update peak rate: + * + * peak_rate = peak_rate * (divisor-1) / divisor + rate / divisor + */ + bfqd->peak_rate *= divisor-1; + bfqd->peak_rate /= divisor; + rate /= divisor; /* smoothing constant alpha = 1/divisor */ + + bfq_log(bfqd, + "update_rate_reset: divisor %d tmp_peak_rate %llu tmp_rate %u", + divisor, + ((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT), + (u32)((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT)); + + BUG_ON(bfqd->peak_rate == 0); + BUG_ON(bfqd->peak_rate > 20<peak_rate += rate; + update_thr_responsiveness_params(bfqd); + BUG_ON(bfqd->peak_rate > 20<peak_rate_samples == 0) { /* first dispatch */ + bfq_log(bfqd, + "update_peak_rate: goto reset, samples %d", + bfqd->peak_rate_samples) ; + bfq_reset_rate_computation(bfqd, rq); + goto update_last_values; /* will add one sample */ + } + + /* + * Device idle for very long: the observation interval lasting + * up to this dispatch cannot be a valid observation interval + * for computing a new peak rate (similarly to the late- + * completion event in bfq_completed_request()). Go to + * update_rate_and_reset to have the following three steps + * taken: + * - close the observation interval at the last (previous) + * request dispatch or completion + * - compute rate, if possible, for that observation interval + * - start a new observation interval with this dispatch + */ + if (now_ns - bfqd->last_dispatch > 100*NSEC_PER_MSEC && + bfqd->rq_in_driver == 0) { + bfq_log(bfqd, +"update_peak_rate: jumping to updating&resetting delta_last %lluus samples %d", + (now_ns - bfqd->last_dispatch)>>10, + bfqd->peak_rate_samples) ; + goto update_rate_and_reset; + } + + /* Update sampling information */ + bfqd->peak_rate_samples++; + + if ((bfqd->rq_in_driver > 0 || + now_ns - bfqd->last_completion < BFQ_MIN_TT) + && get_sdist(bfqd->last_position, rq) < BFQQ_SEEK_THR) + bfqd->sequential_samples++; + + bfqd->tot_sectors_dispatched += blk_rq_sectors(rq); + + /* Reset max observed rq size every 32 dispatches */ + if (likely(bfqd->peak_rate_samples % 32)) + bfqd->last_rq_max_size = + max_t(u32, blk_rq_sectors(rq), bfqd->last_rq_max_size); + else + bfqd->last_rq_max_size = blk_rq_sectors(rq); + + bfqd->delta_from_first = now_ns - bfqd->first_dispatch; + + bfq_log(bfqd, + "update_peak_rate: added samples %u/%u tot_sects %llu delta_first %lluus", + bfqd->peak_rate_samples, bfqd->sequential_samples, + bfqd->tot_sectors_dispatched, + bfqd->delta_from_first>>10); + + /* Target observation interval not yet reached, go on sampling */ + if (bfqd->delta_from_first < BFQ_RATE_REF_INTERVAL) + goto update_last_values; + +update_rate_and_reset: + bfq_update_rate_reset(bfqd, rq); +update_last_values: + bfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq); + bfqd->last_dispatch = now_ns; + + bfq_log(bfqd, + "update_peak_rate: delta_first %lluus last_pos %llu peak_rate %llu", + (now_ns - bfqd->first_dispatch)>>10, + (unsigned long long) bfqd->last_position, + ((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT)); + bfq_log(bfqd, + "update_peak_rate: samples at end %d", bfqd->peak_rate_samples); +} + +/* + * Move request from internal lists to the dispatch list of the request queue + */ +static void bfq_dispatch_insert(struct request_queue *q, struct request *rq) +{ + struct bfq_queue *bfqq = RQ_BFQQ(rq); + + /* + * For consistency, the next instruction should have been executed + * after removing the request from the queue and dispatching it. + * We execute instead this instruction before bfq_remove_request() + * (and hence introduce a temporary inconsistency), for efficiency. + * In fact, in a forced_dispatch, this prevents two counters related + * to bfqq->dispatched to risk to be uselessly decremented if bfqq + * is not in service, and then to be incremented again after + * incrementing bfqq->dispatched. + */ + bfqq->dispatched++; + bfq_update_peak_rate(q->elevator->elevator_data, rq); + + bfq_remove_request(rq); + elv_dispatch_sort(q, rq); +} + +static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq) +{ + BUG_ON(bfqq != bfqd->in_service_queue); + + /* + * If this bfqq is shared between multiple processes, check + * to make sure that those processes are still issuing I/Os + * within the mean seek distance. If not, it may be time to + * break the queues apart again. + */ + if (bfq_bfqq_coop(bfqq) && BFQQ_SEEKY(bfqq)) + bfq_mark_bfqq_split_coop(bfqq); + + if (RB_EMPTY_ROOT(&bfqq->sort_list)) { + if (bfqq->dispatched == 0) + /* + * Overloading budget_timeout field to store + * the time at which the queue remains with no + * backlog and no outstanding request; used by + * the weight-raising mechanism. + */ + bfqq->budget_timeout = jiffies; + + bfq_del_bfqq_busy(bfqd, bfqq, true); + } else { + bfq_requeue_bfqq(bfqd, bfqq); + /* + * Resort priority tree of potential close cooperators. + */ + bfq_pos_tree_add_move(bfqd, bfqq); + } + + /* + * All in-service entities must have been properly deactivated + * or requeued before executing the next function, which + * resets all in-service entites as no more in service. + */ + __bfq_bfqd_reset_in_service(bfqd); +} + +/** + * __bfq_bfqq_recalc_budget - try to adapt the budget to the @bfqq behavior. + * @bfqd: device data. + * @bfqq: queue to update. + * @reason: reason for expiration. + * + * Handle the feedback on @bfqq budget at queue expiration. + * See the body for detailed comments. + */ +static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd, + struct bfq_queue *bfqq, + enum bfqq_expiration reason) +{ + struct request *next_rq; + int budget, min_budget; + + BUG_ON(bfqq != bfqd->in_service_queue); + + min_budget = bfq_min_budget(bfqd); + + if (bfqq->wr_coeff == 1) + budget = bfqq->max_budget; + else /* + * Use a constant, low budget for weight-raised queues, + * to help achieve a low latency. Keep it slightly higher + * than the minimum possible budget, to cause a little + * bit fewer expirations. + */ + budget = 2 * min_budget; + + bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last budg %d, budg left %d", + bfqq->entity.budget, bfq_bfqq_budget_left(bfqq)); + bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last max_budg %d, min budg %d", + budget, bfq_min_budget(bfqd)); + bfq_log_bfqq(bfqd, bfqq, "recalc_budg: sync %d, seeky %d", + bfq_bfqq_sync(bfqq), BFQQ_SEEKY(bfqd->in_service_queue)); + + if (bfq_bfqq_sync(bfqq) && bfqq->wr_coeff == 1) { + switch (reason) { + /* + * Caveat: in all the following cases we trade latency + * for throughput. + */ + case BFQ_BFQQ_TOO_IDLE: + /* + * This is the only case where we may reduce + * the budget: if there is no request of the + * process still waiting for completion, then + * we assume (tentatively) that the timer has + * expired because the batch of requests of + * the process could have been served with a + * smaller budget. Hence, betting that + * process will behave in the same way when it + * becomes backlogged again, we reduce its + * next budget. As long as we guess right, + * this budget cut reduces the latency + * experienced by the process. + * + * However, if there are still outstanding + * requests, then the process may have not yet + * issued its next request just because it is + * still waiting for the completion of some of + * the still outstanding ones. So in this + * subcase we do not reduce its budget, on the + * contrary we increase it to possibly boost + * the throughput, as discussed in the + * comments to the BUDGET_TIMEOUT case. + */ + if (bfqq->dispatched > 0) /* still outstanding reqs */ + budget = min(budget * 2, bfqd->bfq_max_budget); + else { + if (budget > 5 * min_budget) + budget -= 4 * min_budget; + else + budget = min_budget; + } + break; + case BFQ_BFQQ_BUDGET_TIMEOUT: + /* + * We double the budget here because it gives + * the chance to boost the throughput if this + * is not a seeky process (and has bumped into + * this timeout because of, e.g., ZBR). + */ + budget = min(budget * 2, bfqd->bfq_max_budget); + break; + case BFQ_BFQQ_BUDGET_EXHAUSTED: + /* + * The process still has backlog, and did not + * let either the budget timeout or the disk + * idling timeout expire. Hence it is not + * seeky, has a short thinktime and may be + * happy with a higher budget too. So + * definitely increase the budget of this good + * candidate to boost the disk throughput. + */ + budget = min(budget * 4, bfqd->bfq_max_budget); + break; + case BFQ_BFQQ_NO_MORE_REQUESTS: + /* + * For queues that expire for this reason, it + * is particularly important to keep the + * budget close to the actual service they + * need. Doing so reduces the timestamp + * misalignment problem described in the + * comments in the body of + * __bfq_activate_entity. In fact, suppose + * that a queue systematically expires for + * BFQ_BFQQ_NO_MORE_REQUESTS and presents a + * new request in time to enjoy timestamp + * back-shifting. The larger the budget of the + * queue is with respect to the service the + * queue actually requests in each service + * slot, the more times the queue can be + * reactivated with the same virtual finish + * time. It follows that, even if this finish + * time is pushed to the system virtual time + * to reduce the consequent timestamp + * misalignment, the queue unjustly enjoys for + * many re-activations a lower finish time + * than all newly activated queues. + * + * The service needed by bfqq is measured + * quite precisely by bfqq->entity.service. + * Since bfqq does not enjoy device idling, + * bfqq->entity.service is equal to the number + * of sectors that the process associated with + * bfqq requested to read/write before waiting + * for request completions, or blocking for + * other reasons. + */ + budget = max_t(int, bfqq->entity.service, min_budget); + break; + default: + return; + } + } else if (!bfq_bfqq_sync(bfqq)) + /* + * Async queues get always the maximum possible + * budget, as for them we do not care about latency + * (in addition, their ability to dispatch is limited + * by the charging factor). + */ + budget = bfqd->bfq_max_budget; + + bfqq->max_budget = budget; + + if (bfqd->budgets_assigned >= bfq_stats_min_budgets && + !bfqd->bfq_user_max_budget) + bfqq->max_budget = min(bfqq->max_budget, bfqd->bfq_max_budget); + + /* + * If there is still backlog, then assign a new budget, making + * sure that it is large enough for the next request. Since + * the finish time of bfqq must be kept in sync with the + * budget, be sure to call __bfq_bfqq_expire() *after* this + * update. + * + * If there is no backlog, then no need to update the budget; + * it will be updated on the arrival of a new request. + */ + next_rq = bfqq->next_rq; + if (next_rq) { + BUG_ON(reason == BFQ_BFQQ_TOO_IDLE || + reason == BFQ_BFQQ_NO_MORE_REQUESTS); + bfqq->entity.budget = max_t(unsigned long, bfqq->max_budget, + bfq_serv_to_charge(next_rq, bfqq)); + BUG_ON(!bfq_bfqq_busy(bfqq)); + BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list)); + } + + bfq_log_bfqq(bfqd, bfqq, "head sect: %u, new budget %d", + next_rq ? blk_rq_sectors(next_rq) : 0, + bfqq->entity.budget); +} + +/* + * Return true if the process associated with bfqq is "slow". The slow + * flag is used, in addition to the budget timeout, to reduce the + * amount of service provided to seeky processes, and thus reduce + * their chances to lower the throughput. More details in the comments + * on the function bfq_bfqq_expire(). + * + * An important observation is in order: as discussed in the comments + * on the function bfq_update_peak_rate(), with devices with internal + * queues, it is hard if ever possible to know when and for how long + * an I/O request is processed by the device (apart from the trivial + * I/O pattern where a new request is dispatched only after the + * previous one has been completed). This makes it hard to evaluate + * the real rate at which the I/O requests of each bfq_queue are + * served. In fact, for an I/O scheduler like BFQ, serving a + * bfq_queue means just dispatching its requests during its service + * slot (i.e., until the budget of the queue is exhausted, or the + * queue remains idle, or, finally, a timeout fires). But, during the + * service slot of a bfq_queue, around 100 ms at most, the device may + * be even still processing requests of bfq_queues served in previous + * service slots. On the opposite end, the requests of the in-service + * bfq_queue may be completed after the service slot of the queue + * finishes. + * + * Anyway, unless more sophisticated solutions are used + * (where possible), the sum of the sizes of the requests dispatched + * during the service slot of a bfq_queue is probably the only + * approximation available for the service received by the bfq_queue + * during its service slot. And this sum is the quantity used in this + * function to evaluate the I/O speed of a process. + */ +static bool bfq_bfqq_is_slow(struct bfq_data *bfqd, struct bfq_queue *bfqq, + bool compensate, enum bfqq_expiration reason, + unsigned long *delta_ms) +{ + ktime_t delta_ktime; + u32 delta_usecs; + bool slow = BFQQ_SEEKY(bfqq); /* if delta too short, use seekyness */ + + if (!bfq_bfqq_sync(bfqq)) + return false; + + if (compensate) + delta_ktime = bfqd->last_idling_start; + else + delta_ktime = ktime_get(); + delta_ktime = ktime_sub(delta_ktime, bfqd->last_budget_start); + delta_usecs = ktime_to_us(delta_ktime); + + /* don't use too short time intervals */ + if (delta_usecs < 1000) { + if (blk_queue_nonrot(bfqd->queue)) + /* + * give same worst-case guarantees as idling + * for seeky + */ + *delta_ms = BFQ_MIN_TT / NSEC_PER_MSEC; + else /* charge at least one seek */ + *delta_ms = bfq_slice_idle / NSEC_PER_MSEC; + + bfq_log(bfqd, "bfq_bfqq_is_slow: too short %u", delta_usecs); + + return slow; + } + + *delta_ms = delta_usecs / USEC_PER_MSEC; + + /* + * Use only long (> 20ms) intervals to filter out excessive + * spikes in service rate estimation. + */ + if (delta_usecs > 20000) { + /* + * Caveat for rotational devices: processes doing I/O + * in the slower disk zones tend to be slow(er) even + * if not seeky. In this respect, the estimated peak + * rate is likely to be an average over the disk + * surface. Accordingly, to not be too harsh with + * unlucky processes, a process is deemed slow only if + * its rate has been lower than half of the estimated + * peak rate. + */ + slow = bfqq->entity.service < bfqd->bfq_max_budget / 2; + bfq_log(bfqd, "bfq_bfqq_is_slow: relative rate %d/%d", + bfqq->entity.service, bfqd->bfq_max_budget); + } + + bfq_log_bfqq(bfqd, bfqq, "bfq_bfqq_is_slow: slow %d", slow); + + return slow; +} + +/* + * To be deemed as soft real-time, an application must meet two + * requirements. First, the application must not require an average + * bandwidth higher than the approximate bandwidth required to playback or + * record a compressed high-definition video. + * The next function is invoked on the completion of the last request of a + * batch, to compute the next-start time instant, soft_rt_next_start, such + * that, if the next request of the application does not arrive before + * soft_rt_next_start, then the above requirement on the bandwidth is met. + * + * The second requirement is that the request pattern of the application is + * isochronous, i.e., that, after issuing a request or a batch of requests, + * the application stops issuing new requests until all its pending requests + * have been completed. After that, the application may issue a new batch, + * and so on. + * For this reason the next function is invoked to compute + * soft_rt_next_start only for applications that meet this requirement, + * whereas soft_rt_next_start is set to infinity for applications that do + * not. + * + * Unfortunately, even a greedy application may happen to behave in an + * isochronous way if the CPU load is high. In fact, the application may + * stop issuing requests while the CPUs are busy serving other processes, + * then restart, then stop again for a while, and so on. In addition, if + * the disk achieves a low enough throughput with the request pattern + * issued by the application (e.g., because the request pattern is random + * and/or the device is slow), then the application may meet the above + * bandwidth requirement too. To prevent such a greedy application to be + * deemed as soft real-time, a further rule is used in the computation of + * soft_rt_next_start: soft_rt_next_start must be higher than the current + * time plus the maximum time for which the arrival of a request is waited + * for when a sync queue becomes idle, namely bfqd->bfq_slice_idle. + * This filters out greedy applications, as the latter issue instead their + * next request as soon as possible after the last one has been completed + * (in contrast, when a batch of requests is completed, a soft real-time + * application spends some time processing data). + * + * Unfortunately, the last filter may easily generate false positives if + * only bfqd->bfq_slice_idle is used as a reference time interval and one + * or both the following cases occur: + * 1) HZ is so low that the duration of a jiffy is comparable to or higher + * than bfqd->bfq_slice_idle. This happens, e.g., on slow devices with + * HZ=100. + * 2) jiffies, instead of increasing at a constant rate, may stop increasing + * for a while, then suddenly 'jump' by several units to recover the lost + * increments. This seems to happen, e.g., inside virtual machines. + * To address this issue, we do not use as a reference time interval just + * bfqd->bfq_slice_idle, but bfqd->bfq_slice_idle plus a few jiffies. In + * particular we add the minimum number of jiffies for which the filter + * seems to be quite precise also in embedded systems and KVM/QEMU virtual + * machines. + */ +static unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd, + struct bfq_queue *bfqq) +{ + bfq_log_bfqq(bfqd, bfqq, +"softrt_next_start: service_blkg %lu soft_rate %u sects/sec interval %u", + bfqq->service_from_backlogged, + bfqd->bfq_wr_max_softrt_rate, + jiffies_to_msecs(HZ * bfqq->service_from_backlogged / + bfqd->bfq_wr_max_softrt_rate)); + + return max(bfqq->last_idle_bklogged + + HZ * bfqq->service_from_backlogged / + bfqd->bfq_wr_max_softrt_rate, + jiffies + nsecs_to_jiffies(bfqq->bfqd->bfq_slice_idle) + 4); +} + +/* + * Return the farthest future time instant according to jiffies + * macros. + */ +static unsigned long bfq_greatest_from_now(void) +{ + return jiffies + MAX_JIFFY_OFFSET; +} + +/* + * Return the farthest past time instant according to jiffies + * macros. + */ +static unsigned long bfq_smallest_from_now(void) +{ + return jiffies - MAX_JIFFY_OFFSET; +} + +/** + * bfq_bfqq_expire - expire a queue. + * @bfqd: device owning the queue. + * @bfqq: the queue to expire. + * @compensate: if true, compensate for the time spent idling. + * @reason: the reason causing the expiration. + * + * If the process associated with bfqq does slow I/O (e.g., because it + * issues random requests), we charge bfqq with the time it has been + * in service instead of the service it has received (see + * bfq_bfqq_charge_time for details on how this goal is achieved). As + * a consequence, bfqq will typically get higher timestamps upon + * reactivation, and hence it will be rescheduled as if it had + * received more service than what it has actually received. In the + * end, bfqq receives less service in proportion to how slowly its + * associated process consumes its budgets (and hence how seriously it + * tends to lower the throughput). In addition, this time-charging + * strategy guarantees time fairness among slow processes. In + * contrast, if the process associated with bfqq is not slow, we + * charge bfqq exactly with the service it has received. + * + * Charging time to the first type of queues and the exact service to + * the other has the effect of using the WF2Q+ policy to schedule the + * former on a timeslice basis, without violating service domain + * guarantees among the latter. + */ +static void bfq_bfqq_expire(struct bfq_data *bfqd, + struct bfq_queue *bfqq, + bool compensate, + enum bfqq_expiration reason) +{ + bool slow; + unsigned long delta = 0; + struct bfq_entity *entity = &bfqq->entity; + int ref; + + BUG_ON(bfqq != bfqd->in_service_queue); + + /* + * Check whether the process is slow (see bfq_bfqq_is_slow). + */ + slow = bfq_bfqq_is_slow(bfqd, bfqq, compensate, reason, &delta); + + /* + * Increase service_from_backlogged before next statement, + * because the possible next invocation of + * bfq_bfqq_charge_time would likely inflate + * entity->service. In contrast, service_from_backlogged must + * contain real service, to enable the soft real-time + * heuristic to correctly compute the bandwidth consumed by + * bfqq. + */ + bfqq->service_from_backlogged += entity->service; + + /* + * As above explained, charge slow (typically seeky) and + * timed-out queues with the time and not the service + * received, to favor sequential workloads. + * + * Processes doing I/O in the slower disk zones will tend to + * be slow(er) even if not seeky. Therefore, since the + * estimated peak rate is actually an average over the disk + * surface, these processes may timeout just for bad luck. To + * avoid punishing them, do not charge time to processes that + * succeeded in consuming at least 2/3 of their budget. This + * allows BFQ to preserve enough elasticity to still perform + * bandwidth, and not time, distribution with little unlucky + * or quasi-sequential processes. + */ + if (bfqq->wr_coeff == 1 && + (slow || + (reason == BFQ_BFQQ_BUDGET_TIMEOUT && + bfq_bfqq_budget_left(bfqq) >= entity->budget / 3))) + bfq_bfqq_charge_time(bfqd, bfqq, delta); + + BUG_ON(bfqq->entity.budget < bfqq->entity.service); + + if (reason == BFQ_BFQQ_TOO_IDLE && + entity->service <= 2 * entity->budget / 10) + bfq_clear_bfqq_IO_bound(bfqq); + + if (bfqd->low_latency && bfqq->wr_coeff == 1) + bfqq->last_wr_start_finish = jiffies; + + if (bfqd->low_latency && bfqd->bfq_wr_max_softrt_rate > 0 && + RB_EMPTY_ROOT(&bfqq->sort_list)) { + /* + * If we get here, and there are no outstanding + * requests, then the request pattern is isochronous + * (see the comments on the function + * bfq_bfqq_softrt_next_start()). Thus we can compute + * soft_rt_next_start. If, instead, the queue still + * has outstanding requests, then we have to wait for + * the completion of all the outstanding requests to + * discover whether the request pattern is actually + * isochronous. + */ + BUG_ON(bfqd->busy_queues < 1); + if (bfqq->dispatched == 0) { + bfqq->soft_rt_next_start = + bfq_bfqq_softrt_next_start(bfqd, bfqq); + bfq_log_bfqq(bfqd, bfqq, "new soft_rt_next %lu", + bfqq->soft_rt_next_start); + } else { + /* + * The application is still waiting for the + * completion of one or more requests: + * prevent it from possibly being incorrectly + * deemed as soft real-time by setting its + * soft_rt_next_start to infinity. In fact, + * without this assignment, the application + * would be incorrectly deemed as soft + * real-time if: + * 1) it issued a new request before the + * completion of all its in-flight + * requests, and + * 2) at that time, its soft_rt_next_start + * happened to be in the past. + */ + bfqq->soft_rt_next_start = + bfq_greatest_from_now(); + /* + * Schedule an update of soft_rt_next_start to when + * the task may be discovered to be isochronous. + */ + bfq_mark_bfqq_softrt_update(bfqq); + } + } + + bfq_log_bfqq(bfqd, bfqq, + "expire (%d, slow %d, num_disp %d, short_ttime %d, weight %d)", + reason, slow, bfqq->dispatched, + bfq_bfqq_has_short_ttime(bfqq), entity->weight); + + /* + * Increase, decrease or leave budget unchanged according to + * reason. + */ + BUG_ON(bfqq->entity.budget < bfqq->entity.service); + __bfq_bfqq_recalc_budget(bfqd, bfqq, reason); + BUG_ON(bfqq->next_rq == NULL && + bfqq->entity.budget < bfqq->entity.service); + ref = bfqq->ref; + __bfq_bfqq_expire(bfqd, bfqq); + + BUG_ON(ref > 1 && + !bfq_bfqq_busy(bfqq) && reason == BFQ_BFQQ_BUDGET_EXHAUSTED && + !bfq_class_idle(bfqq)); + + /* mark bfqq as waiting a request only if a bic still points to it */ + if (ref > 1 && !bfq_bfqq_busy(bfqq) && + reason != BFQ_BFQQ_BUDGET_TIMEOUT && + reason != BFQ_BFQQ_BUDGET_EXHAUSTED) + bfq_mark_bfqq_non_blocking_wait_rq(bfqq); +} + +/* + * Budget timeout is not implemented through a dedicated timer, but + * just checked on request arrivals and completions, as well as on + * idle timer expirations. + */ +static bool bfq_bfqq_budget_timeout(struct bfq_queue *bfqq) +{ + return time_is_before_eq_jiffies(bfqq->budget_timeout); +} + +/* + * If we expire a queue that is actively waiting (i.e., with the + * device idled) for the arrival of a new request, then we may incur + * the timestamp misalignment problem described in the body of the + * function __bfq_activate_entity. Hence we return true only if this + * condition does not hold, or if the queue is slow enough to deserve + * only to be kicked off for preserving a high throughput. + */ +static bool bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq) +{ + bfq_log_bfqq(bfqq->bfqd, bfqq, + "may_budget_timeout: wait_request %d left %d timeout %d", + bfq_bfqq_wait_request(bfqq), + bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3, + bfq_bfqq_budget_timeout(bfqq)); + + return (!bfq_bfqq_wait_request(bfqq) || + bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3) + && + bfq_bfqq_budget_timeout(bfqq); +} + +/* + * For a queue that becomes empty, device idling is allowed only if + * this function returns true for that queue. As a consequence, since + * device idling plays a critical role for both throughput boosting + * and service guarantees, the return value of this function plays a + * critical role as well. + * + * In a nutshell, this function returns true only if idling is + * beneficial for throughput or, even if detrimental for throughput, + * idling is however necessary to preserve service guarantees (low + * latency, desired throughput distribution, ...). In particular, on + * NCQ-capable devices, this function tries to return false, so as to + * help keep the drives' internal queues full, whenever this helps the + * device boost the throughput without causing any service-guarantee + * issue. + * + * In more detail, the return value of this function is obtained by, + * first, computing a number of boolean variables that take into + * account throughput and service-guarantee issues, and, then, + * combining these variables in a logical expression. Most of the + * issues taken into account are not trivial. We discuss these issues + * while introducing the variables. + */ +static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq) +{ + struct bfq_data *bfqd = bfqq->bfqd; + bool rot_without_queueing = + !blk_queue_nonrot(bfqd->queue) && !bfqd->hw_tag, + bfqq_sequential_and_IO_bound, + idling_boosts_thr, idling_boosts_thr_without_issues, + idling_needed_for_service_guarantees, + asymmetric_scenario; + + if (bfqd->strict_guarantees) + return true; + + /* + * Idling is performed only if slice_idle > 0. In addition, we + * do not idle if + * (a) bfqq is async + * (b) bfqq is in the idle io prio class: in this case we do + * not idle because we want to minimize the bandwidth that + * queues in this class can steal to higher-priority queues + */ + if (bfqd->bfq_slice_idle == 0 || !bfq_bfqq_sync(bfqq) || + bfq_class_idle(bfqq)) + return false; + + bfqq_sequential_and_IO_bound = !BFQQ_SEEKY(bfqq) && + bfq_bfqq_IO_bound(bfqq) && bfq_bfqq_has_short_ttime(bfqq); + /* + * The next variable takes into account the cases where idling + * boosts the throughput. + * + * The value of the variable is computed considering, first, that + * idling is virtually always beneficial for the throughput if: + * (a) the device is not NCQ-capable and rotational, or + * (b) regardless of the presence of NCQ, the device is rotational and + * the request pattern for bfqq is I/O-bound and sequential, or + * (c) regardless of whether it is rotational, the device is + * not NCQ-capable and the request pattern for bfqq is + * I/O-bound and sequential. + * + * Secondly, and in contrast to the above item (b), idling an + * NCQ-capable flash-based device would not boost the + * throughput even with sequential I/O; rather it would lower + * the throughput in proportion to how fast the device + * is. Accordingly, the next variable is true if any of the + * above conditions (a), (b) or (c) is true, and, in + * particular, happens to be false if bfqd is an NCQ-capable + * flash-based device. + */ + idling_boosts_thr = rot_without_queueing || + ((!blk_queue_nonrot(bfqd->queue) || !bfqd->hw_tag) && + bfqq_sequential_and_IO_bound); + + /* + * The value of the next variable, + * idling_boosts_thr_without_issues, is equal to that of + * idling_boosts_thr, unless a special case holds. In this + * special case, described below, idling may cause problems to + * weight-raised queues. + * + * When the request pool is saturated (e.g., in the presence + * of write hogs), if the processes associated with + * non-weight-raised queues ask for requests at a lower rate, + * then processes associated with weight-raised queues have a + * higher probability to get a request from the pool + * immediately (or at least soon) when they need one. Thus + * they have a higher probability to actually get a fraction + * of the device throughput proportional to their high + * weight. This is especially true with NCQ-capable drives, + * which enqueue several requests in advance, and further + * reorder internally-queued requests. + * + * For this reason, we force to false the value of + * idling_boosts_thr_without_issues if there are weight-raised + * busy queues. In this case, and if bfqq is not weight-raised, + * this guarantees that the device is not idled for bfqq (if, + * instead, bfqq is weight-raised, then idling will be + * guaranteed by another variable, see below). Combined with + * the timestamping rules of BFQ (see [1] for details), this + * behavior causes bfqq, and hence any sync non-weight-raised + * queue, to get a lower number of requests served, and thus + * to ask for a lower number of requests from the request + * pool, before the busy weight-raised queues get served + * again. This often mitigates starvation problems in the + * presence of heavy write workloads and NCQ, thereby + * guaranteeing a higher application and system responsiveness + * in these hostile scenarios. + */ + idling_boosts_thr_without_issues = idling_boosts_thr && + bfqd->wr_busy_queues == 0; + + /* + * There is then a case where idling must be performed not + * for throughput concerns, but to preserve service + * guarantees. + * + * To introduce this case, we can note that allowing the drive + * to enqueue more than one request at a time, and hence + * delegating de facto final scheduling decisions to the + * drive's internal scheduler, entails loss of control on the + * actual request service order. In particular, the critical + * situation is when requests from different processes happen + * to be present, at the same time, in the internal queue(s) + * of the drive. In such a situation, the drive, by deciding + * the service order of the internally-queued requests, does + * determine also the actual throughput distribution among + * these processes. But the drive typically has no notion or + * concern about per-process throughput distribution, and + * makes its decisions only on a per-request basis. Therefore, + * the service distribution enforced by the drive's internal + * scheduler is likely to coincide with the desired + * device-throughput distribution only in a completely + * symmetric scenario where: + * (i) each of these processes must get the same throughput as + * the others; + * (ii) all these processes have the same I/O pattern + * (either sequential or random). + * In fact, in such a scenario, the drive will tend to treat + * the requests of each of these processes in about the same + * way as the requests of the others, and thus to provide + * each of these processes with about the same throughput + * (which is exactly the desired throughput distribution). In + * contrast, in any asymmetric scenario, device idling is + * certainly needed to guarantee that bfqq receives its + * assigned fraction of the device throughput (see [1] for + * details). + * + * We address this issue by controlling, actually, only the + * symmetry sub-condition (i), i.e., provided that + * sub-condition (i) holds, idling is not performed, + * regardless of whether sub-condition (ii) holds. In other + * words, only if sub-condition (i) holds, then idling is + * allowed, and the device tends to be prevented from queueing + * many requests, possibly of several processes. The reason + * for not controlling also sub-condition (ii) is that we + * exploit preemption to preserve guarantees in case of + * symmetric scenarios, even if (ii) does not hold, as + * explained in the next two paragraphs. + * + * Even if a queue, say Q, is expired when it remains idle, Q + * can still preempt the new in-service queue if the next + * request of Q arrives soon (see the comments on + * bfq_bfqq_update_budg_for_activation). If all queues and + * groups have the same weight, this form of preemption, + * combined with the hole-recovery heuristic described in the + * comments on function bfq_bfqq_update_budg_for_activation, + * are enough to preserve a correct bandwidth distribution in + * the mid term, even without idling. In fact, even if not + * idling allows the internal queues of the device to contain + * many requests, and thus to reorder requests, we can rather + * safely assume that the internal scheduler still preserves a + * minimum of mid-term fairness. The motivation for using + * preemption instead of idling is that, by not idling, + * service guarantees are preserved without minimally + * sacrificing throughput. In other words, both a high + * throughput and its desired distribution are obtained. + * + * More precisely, this preemption-based, idleless approach + * provides fairness in terms of IOPS, and not sectors per + * second. This can be seen with a simple example. Suppose + * that there are two queues with the same weight, but that + * the first queue receives requests of 8 sectors, while the + * second queue receives requests of 1024 sectors. In + * addition, suppose that each of the two queues contains at + * most one request at a time, which implies that each queue + * always remains idle after it is served. Finally, after + * remaining idle, each queue receives very quickly a new + * request. It follows that the two queues are served + * alternatively, preempting each other if needed. This + * implies that, although both queues have the same weight, + * the queue with large requests receives a service that is + * 1024/8 times as high as the service received by the other + * queue. + * + * On the other hand, device idling is performed, and thus + * pure sector-domain guarantees are provided, for the + * following queues, which are likely to need stronger + * throughput guarantees: weight-raised queues, and queues + * with a higher weight than other queues. When such queues + * are active, sub-condition (i) is false, which triggers + * device idling. + * + * According to the above considerations, the next variable is + * true (only) if sub-condition (i) holds. To compute the + * value of this variable, we not only use the return value of + * the function bfq_symmetric_scenario(), but also check + * whether bfqq is being weight-raised, because + * bfq_symmetric_scenario() does not take into account also + * weight-raised queues (see comments on + * bfq_weights_tree_add()). + * + * As a side note, it is worth considering that the above + * device-idling countermeasures may however fail in the + * following unlucky scenario: if idling is (correctly) + * disabled in a time period during which all symmetry + * sub-conditions hold, and hence the device is allowed to + * enqueue many requests, but at some later point in time some + * sub-condition stops to hold, then it may become impossible + * to let requests be served in the desired order until all + * the requests already queued in the device have been served. + */ + asymmetric_scenario = bfqq->wr_coeff > 1 || + !bfq_symmetric_scenario(bfqd); + + /* + * Finally, there is a case where maximizing throughput is the + * best choice even if it may cause unfairness toward + * bfqq. Such a case is when bfqq became active in a burst of + * queue activations. Queues that became active during a large + * burst benefit only from throughput, as discussed in the + * comments on bfq_handle_burst. Thus, if bfqq became active + * in a burst and not idling the device maximizes throughput, + * then the device must no be idled, because not idling the + * device provides bfqq and all other queues in the burst with + * maximum benefit. Combining this and the above case, we can + * now establish when idling is actually needed to preserve + * service guarantees. + */ + idling_needed_for_service_guarantees = + asymmetric_scenario && !bfq_bfqq_in_large_burst(bfqq); + + /* + * We have now all the components we need to compute the + * return value of the function, which is true only if idling + * either boosts the throughput (without issues), or is + * necessary to preserve service guarantees. + */ + bfq_log_bfqq(bfqd, bfqq, "may_idle: sync %d idling_boosts_thr %d", + bfq_bfqq_sync(bfqq), idling_boosts_thr); + + bfq_log_bfqq(bfqd, bfqq, + "may_idle: wr_busy %d boosts %d IO-bound %d guar %d", + bfqd->wr_busy_queues, + idling_boosts_thr_without_issues, + bfq_bfqq_IO_bound(bfqq), + idling_needed_for_service_guarantees); + + return idling_boosts_thr_without_issues || + idling_needed_for_service_guarantees; +} + +/* + * If the in-service queue is empty but the function bfq_bfqq_may_idle + * returns true, then: + * 1) the queue must remain in service and cannot be expired, and + * 2) the device must be idled to wait for the possible arrival of a new + * request for the queue. + * See the comments on the function bfq_bfqq_may_idle for the reasons + * why performing device idling is the best choice to boost the throughput + * and preserve service guarantees when bfq_bfqq_may_idle itself + * returns true. + */ +static bool bfq_bfqq_must_idle(struct bfq_queue *bfqq) +{ + return RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_bfqq_may_idle(bfqq); +} + +/* + * Select a queue for service. If we have a current queue in service, + * check whether to continue servicing it, or retrieve and set a new one. + */ +static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) +{ + struct bfq_queue *bfqq; + struct request *next_rq; + enum bfqq_expiration reason = BFQ_BFQQ_BUDGET_TIMEOUT; + + bfqq = bfqd->in_service_queue; + if (!bfqq) + goto new_queue; + + bfq_log_bfqq(bfqd, bfqq, "select_queue: already in-service queue"); + + if (bfq_may_expire_for_budg_timeout(bfqq) && + !hrtimer_active(&bfqd->idle_slice_timer) && + !bfq_bfqq_must_idle(bfqq)) + goto expire; + +check_queue: + /* + * This loop is rarely executed more than once. Even when it + * happens, it is much more convenient to re-execute this loop + * than to return NULL and trigger a new dispatch to get a + * request served. + */ + next_rq = bfqq->next_rq; + /* + * If bfqq has requests queued and it has enough budget left to + * serve them, keep the queue, otherwise expire it. + */ + if (next_rq) { + BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list)); + + if (bfq_serv_to_charge(next_rq, bfqq) > + bfq_bfqq_budget_left(bfqq)) { + /* + * Expire the queue for budget exhaustion, + * which makes sure that the next budget is + * enough to serve the next request, even if + * it comes from the fifo expired path. + */ + reason = BFQ_BFQQ_BUDGET_EXHAUSTED; + goto expire; + } else { + /* + * The idle timer may be pending because we may + * not disable disk idling even when a new request + * arrives. + */ + if (bfq_bfqq_wait_request(bfqq)) { + BUG_ON(!hrtimer_active(&bfqd->idle_slice_timer)); + /* + * If we get here: 1) at least a new request + * has arrived but we have not disabled the + * timer because the request was too small, + * 2) then the block layer has unplugged + * the device, causing the dispatch to be + * invoked. + * + * Since the device is unplugged, now the + * requests are probably large enough to + * provide a reasonable throughput. + * So we disable idling. + */ + bfq_clear_bfqq_wait_request(bfqq); + hrtimer_try_to_cancel(&bfqd->idle_slice_timer); + bfqg_stats_update_idle_time(bfqq_group(bfqq)); + } + goto keep_queue; + } + } + + /* + * No requests pending. However, if the in-service queue is idling + * for a new request, or has requests waiting for a completion and + * may idle after their completion, then keep it anyway. + */ + if (hrtimer_active(&bfqd->idle_slice_timer) || + (bfqq->dispatched != 0 && bfq_bfqq_may_idle(bfqq))) { + bfqq = NULL; + goto keep_queue; + } + + reason = BFQ_BFQQ_NO_MORE_REQUESTS; +expire: + bfq_bfqq_expire(bfqd, bfqq, false, reason); +new_queue: + bfqq = bfq_set_in_service_queue(bfqd); + if (bfqq) { + bfq_log_bfqq(bfqd, bfqq, "select_queue: checking new queue"); + goto check_queue; + } +keep_queue: + if (bfqq) + bfq_log_bfqq(bfqd, bfqq, "select_queue: returned this queue"); + else + bfq_log(bfqd, "select_queue: no queue returned"); + + return bfqq; +} + +static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq) +{ + struct bfq_entity *entity = &bfqq->entity; + + if (bfqq->wr_coeff > 1) { /* queue is being weight-raised */ + BUG_ON(bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time && + time_is_after_jiffies(bfqq->last_wr_start_finish)); + + bfq_log_bfqq(bfqd, bfqq, + "raising period dur %u/%u msec, old coeff %u, w %d(%d)", + jiffies_to_msecs(jiffies - bfqq->last_wr_start_finish), + jiffies_to_msecs(bfqq->wr_cur_max_time), + bfqq->wr_coeff, + bfqq->entity.weight, bfqq->entity.orig_weight); + + BUG_ON(bfqq != bfqd->in_service_queue && entity->weight != + entity->orig_weight * bfqq->wr_coeff); + if (entity->prio_changed) + bfq_log_bfqq(bfqd, bfqq, "WARN: pending prio change"); + + /* + * If the queue was activated in a burst, or too much + * time has elapsed from the beginning of this + * weight-raising period, then end weight raising. + */ + if (bfq_bfqq_in_large_burst(bfqq)) + bfq_bfqq_end_wr(bfqq); + else if (time_is_before_jiffies(bfqq->last_wr_start_finish + + bfqq->wr_cur_max_time)) { + if (bfqq->wr_cur_max_time != bfqd->bfq_wr_rt_max_time || + time_is_before_jiffies(bfqq->wr_start_at_switch_to_srt + + bfq_wr_duration(bfqd))) + bfq_bfqq_end_wr(bfqq); + else { + /* switch back to interactive wr */ + bfqq->wr_coeff = bfqd->bfq_wr_coeff; + bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); + bfqq->last_wr_start_finish = + bfqq->wr_start_at_switch_to_srt; + BUG_ON(time_is_after_jiffies( + bfqq->last_wr_start_finish)); + bfqq->entity.prio_changed = 1; + bfq_log_bfqq(bfqd, bfqq, + "back to interactive wr"); + } + } + } + /* + * To improve latency (for this or other queues), immediately + * update weight both if it must be raised and if it must be + * lowered. Since, entity may be on some active tree here, and + * might have a pending change of its ioprio class, invoke + * next function with the last parameter unset (see the + * comments on the function). + */ + if ((entity->weight > entity->orig_weight) != (bfqq->wr_coeff > 1)) + __bfq_entity_update_weight_prio(bfq_entity_service_tree(entity), + entity, false); +} + +/* + * Dispatch one request from bfqq, moving it to the request queue + * dispatch list. + */ +static int bfq_dispatch_request(struct bfq_data *bfqd, + struct bfq_queue *bfqq) +{ + int dispatched = 0; + struct request *rq = bfqq->next_rq; + unsigned long service_to_charge; + + BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list)); + BUG_ON(!rq); + service_to_charge = bfq_serv_to_charge(rq, bfqq); + + BUG_ON(service_to_charge > bfq_bfqq_budget_left(bfqq)); + + BUG_ON(bfqq->entity.budget < bfqq->entity.service); + + bfq_bfqq_served(bfqq, service_to_charge); + + BUG_ON(bfqq->entity.budget < bfqq->entity.service); + + bfq_dispatch_insert(bfqd->queue, rq); + + /* + * If weight raising has to terminate for bfqq, then next + * function causes an immediate update of bfqq's weight, + * without waiting for next activation. As a consequence, on + * expiration, bfqq will be timestamped as if has never been + * weight-raised during this service slot, even if it has + * received part or even most of the service as a + * weight-raised queue. This inflates bfqq's timestamps, which + * is beneficial, as bfqq is then more willing to leave the + * device immediately to possible other weight-raised queues. + */ + bfq_update_wr_data(bfqd, bfqq); + + bfq_log_bfqq(bfqd, bfqq, + "dispatched %u sec req (%llu), budg left %d", + blk_rq_sectors(rq), + (unsigned long long) blk_rq_pos(rq), + bfq_bfqq_budget_left(bfqq)); + + dispatched++; + + if (!bfqd->in_service_bic) { + atomic_long_inc(&RQ_BIC(rq)->icq.ioc->refcount); + bfqd->in_service_bic = RQ_BIC(rq); + } + + if (bfqd->busy_queues > 1 && bfq_class_idle(bfqq)) + goto expire; + + return dispatched; + +expire: + bfq_bfqq_expire(bfqd, bfqq, false, BFQ_BFQQ_BUDGET_EXHAUSTED); + return dispatched; +} + +static int __bfq_forced_dispatch_bfqq(struct bfq_queue *bfqq) +{ + int dispatched = 0; + + while (bfqq->next_rq) { + bfq_dispatch_insert(bfqq->bfqd->queue, bfqq->next_rq); + dispatched++; + } + + BUG_ON(!list_empty(&bfqq->fifo)); + return dispatched; +} + +/* + * Drain our current requests. + * Used for barriers and when switching io schedulers on-the-fly. + */ +static int bfq_forced_dispatch(struct bfq_data *bfqd) +{ + struct bfq_queue *bfqq, *n; + struct bfq_service_tree *st; + int dispatched = 0; + + bfqq = bfqd->in_service_queue; + if (bfqq) + __bfq_bfqq_expire(bfqd, bfqq); + + /* + * Loop through classes, and be careful to leave the scheduler + * in a consistent state, as feedback mechanisms and vtime + * updates cannot be disabled during the process. + */ + list_for_each_entry_safe(bfqq, n, &bfqd->active_list, bfqq_list) { + st = bfq_entity_service_tree(&bfqq->entity); + + dispatched += __bfq_forced_dispatch_bfqq(bfqq); + + bfqq->max_budget = bfq_max_budget(bfqd); + bfq_forget_idle(st); + } + + BUG_ON(bfqd->busy_queues != 0); + + return dispatched; +} + +static int bfq_dispatch_requests(struct request_queue *q, int force) +{ + struct bfq_data *bfqd = q->elevator->elevator_data; + struct bfq_queue *bfqq; + + bfq_log(bfqd, "dispatch requests: %d busy queues", bfqd->busy_queues); + + if (bfqd->busy_queues == 0) + return 0; + + if (unlikely(force)) + return bfq_forced_dispatch(bfqd); + + /* + * Force device to serve one request at a time if + * strict_guarantees is true. Forcing this service scheme is + * currently the ONLY way to guarantee that the request + * service order enforced by the scheduler is respected by a + * queueing device. Otherwise the device is free even to make + * some unlucky request wait for as long as the device + * wishes. + * + * Of course, serving one request at at time may cause loss of + * throughput. + */ + if (bfqd->strict_guarantees && bfqd->rq_in_driver > 0) + return 0; + + bfqq = bfq_select_queue(bfqd); + if (!bfqq) + return 0; + + BUG_ON(bfqq->entity.budget < bfqq->entity.service); + + BUG_ON(bfq_bfqq_wait_request(bfqq)); + + if (!bfq_dispatch_request(bfqd, bfqq)) + return 0; + + bfq_log_bfqq(bfqd, bfqq, "dispatched %s request", + bfq_bfqq_sync(bfqq) ? "sync" : "async"); + + BUG_ON(bfqq->next_rq == NULL && + bfqq->entity.budget < bfqq->entity.service); + return 1; +} + +/* + * Task holds one reference to the queue, dropped when task exits. Each rq + * in-flight on this queue also holds a reference, dropped when rq is freed. + * + * Queue lock must be held here. Recall not to use bfqq after calling + * this function on it. + */ +static void bfq_put_queue(struct bfq_queue *bfqq) +{ +#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED + struct bfq_group *bfqg = bfqq_group(bfqq); +#endif + + BUG_ON(bfqq->ref <= 0); + + bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p %d", bfqq, bfqq->ref); + bfqq->ref--; + if (bfqq->ref) + return; + + BUG_ON(rb_first(&bfqq->sort_list)); + BUG_ON(bfqq->allocated[READ] + bfqq->allocated[WRITE] != 0); + BUG_ON(bfqq->entity.tree); + BUG_ON(bfq_bfqq_busy(bfqq)); + + if (bfq_bfqq_sync(bfqq)) + /* + * The fact that this queue is being destroyed does not + * invalidate the fact that this queue may have been + * activated during the current burst. As a consequence, + * although the queue does not exist anymore, and hence + * needs to be removed from the burst list if there, + * the burst size has not to be decremented. + */ + hlist_del_init(&bfqq->burst_list_node); + + bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p freed", bfqq); + + kmem_cache_free(bfq_pool, bfqq); +#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED + bfqg_put(bfqg); +#endif +} + +static void bfq_put_cooperator(struct bfq_queue *bfqq) +{ + struct bfq_queue *__bfqq, *next; + + /* + * If this queue was scheduled to merge with another queue, be + * sure to drop the reference taken on that queue (and others in + * the merge chain). See bfq_setup_merge and bfq_merge_bfqqs. + */ + __bfqq = bfqq->new_bfqq; + while (__bfqq) { + if (__bfqq == bfqq) + break; + next = __bfqq->new_bfqq; + bfq_put_queue(__bfqq); + __bfqq = next; + } +} + +static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) +{ + if (bfqq == bfqd->in_service_queue) { + __bfq_bfqq_expire(bfqd, bfqq); + bfq_schedule_dispatch(bfqd); + } + + bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq, bfqq->ref); + + bfq_put_cooperator(bfqq); + + bfq_put_queue(bfqq); /* release process reference */ +} + +static void bfq_init_icq(struct io_cq *icq) +{ + icq_to_bic(icq)->ttime.last_end_request = ktime_get_ns() - (1ULL<<32); +} + +static void bfq_exit_icq(struct io_cq *icq) +{ + struct bfq_io_cq *bic = icq_to_bic(icq); + struct bfq_data *bfqd = bic_to_bfqd(bic); + + if (bic_to_bfqq(bic, false)) { + bfq_exit_bfqq(bfqd, bic_to_bfqq(bic, false)); + bic_set_bfqq(bic, NULL, false); + } + + if (bic_to_bfqq(bic, true)) { + /* + * If the bic is using a shared queue, put the reference + * taken on the io_context when the bic started using a + * shared bfq_queue. + */ + if (bfq_bfqq_coop(bic_to_bfqq(bic, true))) + put_io_context(icq->ioc); + bfq_exit_bfqq(bfqd, bic_to_bfqq(bic, true)); + bic_set_bfqq(bic, NULL, true); + } +} + +/* + * Update the entity prio values; note that the new values will not + * be used until the next (re)activation. + */ +static void bfq_set_next_ioprio_data(struct bfq_queue *bfqq, + struct bfq_io_cq *bic) +{ + struct task_struct *tsk = current; + int ioprio_class; + + ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio); + switch (ioprio_class) { + default: + dev_err(bfqq->bfqd->queue->backing_dev_info->dev, + "bfq: bad prio class %d\n", ioprio_class); + case IOPRIO_CLASS_NONE: + /* + * No prio set, inherit CPU scheduling settings. + */ + bfqq->new_ioprio = task_nice_ioprio(tsk); + bfqq->new_ioprio_class = task_nice_ioclass(tsk); + break; + case IOPRIO_CLASS_RT: + bfqq->new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio); + bfqq->new_ioprio_class = IOPRIO_CLASS_RT; + break; + case IOPRIO_CLASS_BE: + bfqq->new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio); + bfqq->new_ioprio_class = IOPRIO_CLASS_BE; + break; + case IOPRIO_CLASS_IDLE: + bfqq->new_ioprio_class = IOPRIO_CLASS_IDLE; + bfqq->new_ioprio = 7; + break; + } + + if (bfqq->new_ioprio >= IOPRIO_BE_NR) { + pr_crit("bfq_set_next_ioprio_data: new_ioprio %d\n", + bfqq->new_ioprio); + BUG(); + } + + bfqq->entity.new_weight = bfq_ioprio_to_weight(bfqq->new_ioprio); + bfqq->entity.prio_changed = 1; + bfq_log_bfqq(bfqq->bfqd, bfqq, + "set_next_ioprio_data: bic_class %d prio %d class %d", + ioprio_class, bfqq->new_ioprio, bfqq->new_ioprio_class); +} + +static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio) +{ + struct bfq_data *bfqd = bic_to_bfqd(bic); + struct bfq_queue *bfqq; + unsigned long uninitialized_var(flags); + int ioprio = bic->icq.ioc->ioprio; + + /* + * This condition may trigger on a newly created bic, be sure to + * drop the lock before returning. + */ + if (unlikely(!bfqd) || likely(bic->ioprio == ioprio)) + return; + + bic->ioprio = ioprio; + + bfqq = bic_to_bfqq(bic, false); + if (bfqq) { + /* release process reference on this queue */ + bfq_put_queue(bfqq); + bfqq = bfq_get_queue(bfqd, bio, BLK_RW_ASYNC, bic); + bic_set_bfqq(bic, bfqq, false); + bfq_log_bfqq(bfqd, bfqq, + "check_ioprio_change: bfqq %p %d", + bfqq, bfqq->ref); + } + + bfqq = bic_to_bfqq(bic, true); + if (bfqq) + bfq_set_next_ioprio_data(bfqq, bic); +} + +static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, + struct bfq_io_cq *bic, pid_t pid, int is_sync) +{ + RB_CLEAR_NODE(&bfqq->entity.rb_node); + INIT_LIST_HEAD(&bfqq->fifo); + INIT_HLIST_NODE(&bfqq->burst_list_node); + BUG_ON(!hlist_unhashed(&bfqq->burst_list_node)); + + bfqq->ref = 0; + bfqq->bfqd = bfqd; + + if (bic) + bfq_set_next_ioprio_data(bfqq, bic); + + if (is_sync) { + /* + * No need to mark as has_short_ttime if in + * idle_class, because no device idling is performed + * for queues in idle class + */ + if (!bfq_class_idle(bfqq)) + /* tentatively mark as has_short_ttime */ + bfq_mark_bfqq_has_short_ttime(bfqq); + bfq_mark_bfqq_sync(bfqq); + bfq_mark_bfqq_just_created(bfqq); + } else + bfq_clear_bfqq_sync(bfqq); + bfq_mark_bfqq_IO_bound(bfqq); + + /* Tentative initial value to trade off between thr and lat */ + bfqq->max_budget = (2 * bfq_max_budget(bfqd)) / 3; + bfqq->pid = pid; + + bfqq->wr_coeff = 1; + bfqq->last_wr_start_finish = jiffies; + bfqq->wr_start_at_switch_to_srt = bfq_smallest_from_now(); + bfqq->budget_timeout = bfq_smallest_from_now(); + bfqq->split_time = bfq_smallest_from_now(); + + /* + * Set to the value for which bfqq will not be deemed as + * soft rt when it becomes backlogged. + */ + bfqq->soft_rt_next_start = bfq_greatest_from_now(); + + /* first request is almost certainly seeky */ + bfqq->seek_history = 1; +} + +static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd, + struct bfq_group *bfqg, + int ioprio_class, int ioprio) +{ + switch (ioprio_class) { + case IOPRIO_CLASS_RT: + return &bfqg->async_bfqq[0][ioprio]; + case IOPRIO_CLASS_NONE: + ioprio = IOPRIO_NORM; + /* fall through */ + case IOPRIO_CLASS_BE: + return &bfqg->async_bfqq[1][ioprio]; + case IOPRIO_CLASS_IDLE: + return &bfqg->async_idle_bfqq; + default: + BUG(); + } +} + +static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, + struct bio *bio, bool is_sync, + struct bfq_io_cq *bic) +{ + const int ioprio = IOPRIO_PRIO_DATA(bic->ioprio); + const int ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio); + struct bfq_queue **async_bfqq = NULL; + struct bfq_queue *bfqq; + struct bfq_group *bfqg; + + rcu_read_lock(); + + bfqg = bfq_find_set_group(bfqd, bio_blkcg(bio)); + if (!bfqg) { + bfqq = &bfqd->oom_bfqq; + goto out; + } + + if (!is_sync) { + async_bfqq = bfq_async_queue_prio(bfqd, bfqg, ioprio_class, + ioprio); + bfqq = *async_bfqq; + if (bfqq) + goto out; + } + + bfqq = kmem_cache_alloc_node(bfq_pool, + GFP_NOWAIT | __GFP_ZERO | __GFP_NOWARN, + bfqd->queue->node); + + if (bfqq) { + bfq_init_bfqq(bfqd, bfqq, bic, current->pid, + is_sync); + bfq_init_entity(&bfqq->entity, bfqg); + bfq_log_bfqq(bfqd, bfqq, "allocated"); + } else { + bfqq = &bfqd->oom_bfqq; + bfq_log_bfqq(bfqd, bfqq, "using oom bfqq"); + goto out; + } + + /* + * Pin the queue now that it's allocated, scheduler exit will + * prune it. + */ + if (async_bfqq) { + bfqq->ref++; /* + * Extra group reference, w.r.t. sync + * queue. This extra reference is removed + * only if bfqq->bfqg disappears, to + * guarantee that this queue is not freed + * until its group goes away. + */ + bfq_log_bfqq(bfqd, bfqq, "get_queue, bfqq not in async: %p, %d", + bfqq, bfqq->ref); + *async_bfqq = bfqq; + } + +out: + bfqq->ref++; /* get a process reference to this queue */ + bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq, bfqq->ref); + rcu_read_unlock(); + return bfqq; +} + +static void bfq_update_io_thinktime(struct bfq_data *bfqd, + struct bfq_io_cq *bic) +{ + struct bfq_ttime *ttime = &bic->ttime; + u64 elapsed = ktime_get_ns() - bic->ttime.last_end_request; + + elapsed = min_t(u64, elapsed, 2 * bfqd->bfq_slice_idle); + + ttime->ttime_samples = (7*bic->ttime.ttime_samples + 256) / 8; + ttime->ttime_total = div_u64(7*ttime->ttime_total + 256*elapsed, 8); + ttime->ttime_mean = div64_ul(ttime->ttime_total + 128, + ttime->ttime_samples); +} + +static void +bfq_update_io_seektime(struct bfq_data *bfqd, struct bfq_queue *bfqq, + struct request *rq) +{ + bfqq->seek_history <<= 1; + bfqq->seek_history |= + get_sdist(bfqq->last_request_pos, rq) > BFQQ_SEEK_THR && + (!blk_queue_nonrot(bfqd->queue) || + blk_rq_sectors(rq) < BFQQ_SECT_THR_NONROT); +} + +static void bfq_update_has_short_ttime(struct bfq_data *bfqd, + struct bfq_queue *bfqq, + struct bfq_io_cq *bic) +{ + bool has_short_ttime = true; + + /* + * No need to update has_short_ttime if bfqq is async or in + * idle io prio class, or if bfq_slice_idle is zero, because + * no device idling is performed for bfqq in this case. + */ + if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq) || + bfqd->bfq_slice_idle == 0) + return; + + /* Idle window just restored, statistics are meaningless. */ + if (time_is_after_eq_jiffies(bfqq->split_time + + bfqd->bfq_wr_min_idle_time)) + return; + + /* Think time is infinite if no process is linked to + * bfqq. Otherwise check average think time to + * decide whether to mark as has_short_ttime + */ + if (atomic_read(&bic->icq.ioc->active_ref) == 0 || + (bfq_sample_valid(bic->ttime.ttime_samples) && + bic->ttime.ttime_mean > bfqd->bfq_slice_idle)) + has_short_ttime = false; + + bfq_log_bfqq(bfqd, bfqq, "update_has_short_ttime: has_short_ttime %d", + has_short_ttime); + + if (has_short_ttime) + bfq_mark_bfqq_has_short_ttime(bfqq); + else + bfq_clear_bfqq_has_short_ttime(bfqq); +} + +/* + * Called when a new fs request (rq) is added to bfqq. Check if there's + * something we should do about it. + */ +static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, + struct request *rq) +{ + struct bfq_io_cq *bic = RQ_BIC(rq); + + if (rq->cmd_flags & REQ_META) + bfqq->meta_pending++; + + bfq_update_io_thinktime(bfqd, bic); + bfq_update_has_short_ttime(bfqd, bfqq, bic); + bfq_update_io_seektime(bfqd, bfqq, rq); + + bfq_log_bfqq(bfqd, bfqq, + "rq_enqueued: has_short_ttime=%d (seeky %d)", + bfq_bfqq_has_short_ttime(bfqq), BFQQ_SEEKY(bfqq)); + + bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq); + + if (bfqq == bfqd->in_service_queue && bfq_bfqq_wait_request(bfqq)) { + bool small_req = bfqq->queued[rq_is_sync(rq)] == 1 && + blk_rq_sectors(rq) < 32; + bool budget_timeout = bfq_bfqq_budget_timeout(bfqq); + + /* + * There is just this request queued: if the request + * is small and the queue is not to be expired, then + * just exit. + * + * In this way, if the device is being idled to wait + * for a new request from the in-service queue, we + * avoid unplugging the device and committing the + * device to serve just a small request. On the + * contrary, we wait for the block layer to decide + * when to unplug the device: hopefully, new requests + * will be merged to this one quickly, then the device + * will be unplugged and larger requests will be + * dispatched. + */ + if (small_req && !budget_timeout) + return; + + /* + * A large enough request arrived, or the queue is to + * be expired: in both cases disk idling is to be + * stopped, so clear wait_request flag and reset + * timer. + */ + bfq_clear_bfqq_wait_request(bfqq); + hrtimer_try_to_cancel(&bfqd->idle_slice_timer); + bfqg_stats_update_idle_time(bfqq_group(bfqq)); + + /* + * The queue is not empty, because a new request just + * arrived. Hence we can safely expire the queue, in + * case of budget timeout, without risking that the + * timestamps of the queue are not updated correctly. + * See [1] for more details. + */ + if (budget_timeout) + bfq_bfqq_expire(bfqd, bfqq, false, + BFQ_BFQQ_BUDGET_TIMEOUT); + + /* + * Let the request rip immediately, or let a new queue be + * selected if bfqq has just been expired. + */ + __blk_run_queue(bfqd->queue); + } +} + +static void bfq_insert_request(struct request_queue *q, struct request *rq) +{ + struct bfq_data *bfqd = q->elevator->elevator_data; + struct bfq_queue *bfqq = RQ_BFQQ(rq), *new_bfqq; + + assert_spin_locked(bfqd->queue->queue_lock); + + /* + * An unplug may trigger a requeue of a request from the device + * driver: make sure we are in process context while trying to + * merge two bfq_queues. + */ + if (!in_interrupt()) { + new_bfqq = bfq_setup_cooperator(bfqd, bfqq, rq, true); + if (new_bfqq) { + if (bic_to_bfqq(RQ_BIC(rq), 1) != bfqq) + new_bfqq = bic_to_bfqq(RQ_BIC(rq), 1); + /* + * Release the request's reference to the old bfqq + * and make sure one is taken to the shared queue. + */ + new_bfqq->allocated[rq_data_dir(rq)]++; + bfqq->allocated[rq_data_dir(rq)]--; + new_bfqq->ref++; + bfq_clear_bfqq_just_created(bfqq); + if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq) + bfq_merge_bfqqs(bfqd, RQ_BIC(rq), + bfqq, new_bfqq); + /* + * rq is about to be enqueued into new_bfqq, + * release rq reference on bfqq + */ + bfq_put_queue(bfqq); + rq->elv.priv[1] = new_bfqq; + bfqq = new_bfqq; + } + } + + bfq_add_request(rq); + + rq->fifo_time = ktime_get_ns() + bfqd->bfq_fifo_expire[rq_is_sync(rq)]; + list_add_tail(&rq->queuelist, &bfqq->fifo); + + bfq_rq_enqueued(bfqd, bfqq, rq); +} + +static void bfq_update_hw_tag(struct bfq_data *bfqd) +{ + bfqd->max_rq_in_driver = max_t(int, bfqd->max_rq_in_driver, + bfqd->rq_in_driver); + + if (bfqd->hw_tag == 1) + return; + + /* + * This sample is valid if the number of outstanding requests + * is large enough to allow a queueing behavior. Note that the + * sum is not exact, as it's not taking into account deactivated + * requests. + */ + if (bfqd->rq_in_driver + bfqd->queued < BFQ_HW_QUEUE_THRESHOLD) + return; + + if (bfqd->hw_tag_samples++ < BFQ_HW_QUEUE_SAMPLES) + return; + + bfqd->hw_tag = bfqd->max_rq_in_driver > BFQ_HW_QUEUE_THRESHOLD; + bfqd->max_rq_in_driver = 0; + bfqd->hw_tag_samples = 0; +} + +static void bfq_completed_request(struct request_queue *q, struct request *rq) +{ + struct bfq_queue *bfqq = RQ_BFQQ(rq); + struct bfq_data *bfqd = bfqq->bfqd; + u64 now_ns; + u32 delta_us; + + bfq_log_bfqq(bfqd, bfqq, "completed one req with %u sects left", + blk_rq_sectors(rq)); + + assert_spin_locked(bfqd->queue->queue_lock); + bfq_update_hw_tag(bfqd); + + BUG_ON(!bfqd->rq_in_driver); + BUG_ON(!bfqq->dispatched); + bfqd->rq_in_driver--; + bfqq->dispatched--; + bfqg_stats_update_completion(bfqq_group(bfqq), + rq_start_time_ns(rq), + rq_io_start_time_ns(rq), + rq->cmd_flags); + + if (!bfqq->dispatched && !bfq_bfqq_busy(bfqq)) { + BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list)); + /* + * Set budget_timeout (which we overload to store the + * time at which the queue remains with no backlog and + * no outstanding request; used by the weight-raising + * mechanism). + */ + bfqq->budget_timeout = jiffies; + + bfq_weights_tree_remove(bfqd, &bfqq->entity, + &bfqd->queue_weights_tree); + } + + now_ns = ktime_get_ns(); + + RQ_BIC(rq)->ttime.last_end_request = now_ns; + + /* + * Using us instead of ns, to get a reasonable precision in + * computing rate in next check. + */ + delta_us = div_u64(now_ns - bfqd->last_completion, NSEC_PER_USEC); + + bfq_log(bfqd, "rq_completed: delta %uus/%luus max_size %u rate %llu/%llu", + delta_us, BFQ_MIN_TT/NSEC_PER_USEC, bfqd->last_rq_max_size, + (USEC_PER_SEC* + (u64)((bfqd->last_rq_max_size<>BFQ_RATE_SHIFT, + (USEC_PER_SEC*(u64)(1UL<<(BFQ_RATE_SHIFT-10)))>>BFQ_RATE_SHIFT); + + /* + * If the request took rather long to complete, and, according + * to the maximum request size recorded, this completion latency + * implies that the request was certainly served at a very low + * rate (less than 1M sectors/sec), then the whole observation + * interval that lasts up to this time instant cannot be a + * valid time interval for computing a new peak rate. Invoke + * bfq_update_rate_reset to have the following three steps + * taken: + * - close the observation interval at the last (previous) + * request dispatch or completion + * - compute rate, if possible, for that observation interval + * - reset to zero samples, which will trigger a proper + * re-initialization of the observation interval on next + * dispatch + */ + if (delta_us > BFQ_MIN_TT/NSEC_PER_USEC && + (bfqd->last_rq_max_size<last_completion = now_ns; + + /* + * If we are waiting to discover whether the request pattern + * of the task associated with the queue is actually + * isochronous, and both requisites for this condition to hold + * are now satisfied, then compute soft_rt_next_start (see the + * comments on the function bfq_bfqq_softrt_next_start()). We + * schedule this delayed check when bfqq expires, if it still + * has in-flight requests. + */ + if (bfq_bfqq_softrt_update(bfqq) && bfqq->dispatched == 0 && + RB_EMPTY_ROOT(&bfqq->sort_list)) + bfqq->soft_rt_next_start = + bfq_bfqq_softrt_next_start(bfqd, bfqq); + + /* + * If this is the in-service queue, check if it needs to be expired, + * or if we want to idle in case it has no pending requests. + */ + if (bfqd->in_service_queue == bfqq) { + if (bfqq->dispatched == 0 && bfq_bfqq_must_idle(bfqq)) { + bfq_arm_slice_timer(bfqd); + goto out; + } else if (bfq_may_expire_for_budg_timeout(bfqq)) + bfq_bfqq_expire(bfqd, bfqq, false, + BFQ_BFQQ_BUDGET_TIMEOUT); + else if (RB_EMPTY_ROOT(&bfqq->sort_list) && + (bfqq->dispatched == 0 || + !bfq_bfqq_may_idle(bfqq))) + bfq_bfqq_expire(bfqd, bfqq, false, + BFQ_BFQQ_NO_MORE_REQUESTS); + } + + if (!bfqd->rq_in_driver) + bfq_schedule_dispatch(bfqd); + +out: + return; +} + +static int __bfq_may_queue(struct bfq_queue *bfqq) +{ + if (bfq_bfqq_wait_request(bfqq) && bfq_bfqq_must_alloc(bfqq)) { + bfq_clear_bfqq_must_alloc(bfqq); + return ELV_MQUEUE_MUST; + } + + return ELV_MQUEUE_MAY; +} + +static int bfq_may_queue(struct request_queue *q, unsigned int op) +{ + struct bfq_data *bfqd = q->elevator->elevator_data; + struct task_struct *tsk = current; + struct bfq_io_cq *bic; + struct bfq_queue *bfqq; + + /* + * Don't force setup of a queue from here, as a call to may_queue + * does not necessarily imply that a request actually will be + * queued. So just lookup a possibly existing queue, or return + * 'may queue' if that fails. + */ + bic = bfq_bic_lookup(bfqd, tsk->io_context); + if (!bic) + return ELV_MQUEUE_MAY; + + bfqq = bic_to_bfqq(bic, op_is_sync(op)); + if (bfqq) + return __bfq_may_queue(bfqq); + + return ELV_MQUEUE_MAY; +} + +/* + * Queue lock held here. + */ +static void bfq_put_request(struct request *rq) +{ + struct bfq_queue *bfqq = RQ_BFQQ(rq); + + if (bfqq) { + const int rw = rq_data_dir(rq); + + BUG_ON(!bfqq->allocated[rw]); + bfqq->allocated[rw]--; + + rq->elv.priv[0] = NULL; + rq->elv.priv[1] = NULL; + + bfq_log_bfqq(bfqq->bfqd, bfqq, "put_request %p, %d", + bfqq, bfqq->ref); + bfq_put_queue(bfqq); + } +} + +/* + * Returns NULL if a new bfqq should be allocated, or the old bfqq if this + * was the last process referring to that bfqq. + */ +static struct bfq_queue * +bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq) +{ + bfq_log_bfqq(bfqq->bfqd, bfqq, "splitting queue"); + + put_io_context(bic->icq.ioc); + + if (bfqq_process_refs(bfqq) == 1) { + bfqq->pid = current->pid; + bfq_clear_bfqq_coop(bfqq); + bfq_clear_bfqq_split_coop(bfqq); + return bfqq; + } + + bic_set_bfqq(bic, NULL, 1); + + bfq_put_cooperator(bfqq); + + bfq_put_queue(bfqq); + return NULL; +} + +/* + * Allocate bfq data structures associated with this request. + */ +static int bfq_set_request(struct request_queue *q, struct request *rq, + struct bio *bio, gfp_t gfp_mask) +{ + struct bfq_data *bfqd = q->elevator->elevator_data; + struct bfq_io_cq *bic = icq_to_bic(rq->elv.icq); + const int rw = rq_data_dir(rq); + const int is_sync = rq_is_sync(rq); + struct bfq_queue *bfqq; + unsigned long flags; + bool bfqq_already_existing = false, split = false; + + spin_lock_irqsave(q->queue_lock, flags); + + if (!bic) + goto queue_fail; + + bfq_check_ioprio_change(bic, bio); + + bfq_bic_update_cgroup(bic, bio); + +new_queue: + bfqq = bic_to_bfqq(bic, is_sync); + if (!bfqq || bfqq == &bfqd->oom_bfqq) { + if (bfqq) + bfq_put_queue(bfqq); + bfqq = bfq_get_queue(bfqd, bio, is_sync, bic); + BUG_ON(!hlist_unhashed(&bfqq->burst_list_node)); + + bic_set_bfqq(bic, bfqq, is_sync); + if (split && is_sync) { + bfq_log_bfqq(bfqd, bfqq, + "set_request: was_in_list %d " + "was_in_large_burst %d " + "large burst in progress %d", + bic->was_in_burst_list, + bic->saved_in_large_burst, + bfqd->large_burst); + + if ((bic->was_in_burst_list && bfqd->large_burst) || + bic->saved_in_large_burst) { + bfq_log_bfqq(bfqd, bfqq, + "set_request: marking in " + "large burst"); + bfq_mark_bfqq_in_large_burst(bfqq); + } else { + bfq_log_bfqq(bfqd, bfqq, + "set_request: clearing in " + "large burst"); + bfq_clear_bfqq_in_large_burst(bfqq); + if (bic->was_in_burst_list) + hlist_add_head(&bfqq->burst_list_node, + &bfqd->burst_list); + } + bfqq->split_time = jiffies; + } + } else { + /* If the queue was seeky for too long, break it apart. */ + if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) { + bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq"); + + /* Update bic before losing reference to bfqq */ + if (bfq_bfqq_in_large_burst(bfqq)) + bic->saved_in_large_burst = true; + + bfqq = bfq_split_bfqq(bic, bfqq); + split = true; + if (!bfqq) + goto new_queue; + else + bfqq_already_existing = true; + } + } + + bfqq->allocated[rw]++; + bfqq->ref++; + bfq_log_bfqq(bfqd, bfqq, "set_request: bfqq %p, %d", bfqq, bfqq->ref); + + rq->elv.priv[0] = bic; + rq->elv.priv[1] = bfqq; + + /* + * If a bfq_queue has only one process reference, it is owned + * by only one bfq_io_cq: we can set the bic field of the + * bfq_queue to the address of that structure. Also, if the + * queue has just been split, mark a flag so that the + * information is available to the other scheduler hooks. + */ + if (likely(bfqq != &bfqd->oom_bfqq) && bfqq_process_refs(bfqq) == 1) { + bfqq->bic = bic; + if (split) { + /* + * If the queue has just been split from a shared + * queue, restore the idle window and the possible + * weight raising period. + */ + bfq_bfqq_resume_state(bfqq, bfqd, bic, + bfqq_already_existing); + } + } + + if (unlikely(bfq_bfqq_just_created(bfqq))) + bfq_handle_burst(bfqd, bfqq); + + spin_unlock_irqrestore(q->queue_lock, flags); + + return 0; + +queue_fail: + bfq_schedule_dispatch(bfqd); + spin_unlock_irqrestore(q->queue_lock, flags); + + return 1; +} + +static void bfq_kick_queue(struct work_struct *work) +{ + struct bfq_data *bfqd = + container_of(work, struct bfq_data, unplug_work); + struct request_queue *q = bfqd->queue; + + spin_lock_irq(q->queue_lock); + __blk_run_queue(q); + spin_unlock_irq(q->queue_lock); +} + +/* + * Handler of the expiration of the timer running if the in-service queue + * is idling inside its time slice. + */ +static enum hrtimer_restart bfq_idle_slice_timer(struct hrtimer *timer) +{ + struct bfq_data *bfqd = container_of(timer, struct bfq_data, + idle_slice_timer); + struct bfq_queue *bfqq; + unsigned long flags; + enum bfqq_expiration reason; + + spin_lock_irqsave(bfqd->queue->queue_lock, flags); + + bfqq = bfqd->in_service_queue; + /* + * Theoretical race here: the in-service queue can be NULL or + * different from the queue that was idling if the timer handler + * spins on the queue_lock and a new request arrives for the + * current queue and there is a full dispatch cycle that changes + * the in-service queue. This can hardly happen, but in the worst + * case we just expire a queue too early. + */ + if (bfqq) { + bfq_log_bfqq(bfqd, bfqq, "slice_timer expired"); + bfq_clear_bfqq_wait_request(bfqq); + + if (bfq_bfqq_budget_timeout(bfqq)) + /* + * Also here the queue can be safely expired + * for budget timeout without wasting + * guarantees + */ + reason = BFQ_BFQQ_BUDGET_TIMEOUT; + else if (bfqq->queued[0] == 0 && bfqq->queued[1] == 0) + /* + * The queue may not be empty upon timer expiration, + * because we may not disable the timer when the + * first request of the in-service queue arrives + * during disk idling. + */ + reason = BFQ_BFQQ_TOO_IDLE; + else + goto schedule_dispatch; + + bfq_bfqq_expire(bfqd, bfqq, true, reason); + } + +schedule_dispatch: + bfq_schedule_dispatch(bfqd); + + spin_unlock_irqrestore(bfqd->queue->queue_lock, flags); + return HRTIMER_NORESTART; +} + +static void bfq_shutdown_timer_wq(struct bfq_data *bfqd) +{ + hrtimer_cancel(&bfqd->idle_slice_timer); + cancel_work_sync(&bfqd->unplug_work); +} + +static void __bfq_put_async_bfqq(struct bfq_data *bfqd, + struct bfq_queue **bfqq_ptr) +{ + struct bfq_group *root_group = bfqd->root_group; + struct bfq_queue *bfqq = *bfqq_ptr; + + bfq_log(bfqd, "put_async_bfqq: %p", bfqq); + if (bfqq) { + bfq_bfqq_move(bfqd, bfqq, root_group); + bfq_log_bfqq(bfqd, bfqq, "put_async_bfqq: putting %p, %d", + bfqq, bfqq->ref); + bfq_put_queue(bfqq); + *bfqq_ptr = NULL; + } +} + +/* + * Release all the bfqg references to its async queues. If we are + * deallocating the group these queues may still contain requests, so + * we reparent them to the root cgroup (i.e., the only one that will + * exist for sure until all the requests on a device are gone). + */ +static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg) +{ + int i, j; + + for (i = 0; i < 2; i++) + for (j = 0; j < IOPRIO_BE_NR; j++) + __bfq_put_async_bfqq(bfqd, &bfqg->async_bfqq[i][j]); + + __bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq); +} + +static void bfq_exit_queue(struct elevator_queue *e) +{ + struct bfq_data *bfqd = e->elevator_data; + struct request_queue *q = bfqd->queue; + struct bfq_queue *bfqq, *n; + + bfq_shutdown_timer_wq(bfqd); + + spin_lock_irq(q->queue_lock); + + BUG_ON(bfqd->in_service_queue); + list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list) + bfq_deactivate_bfqq(bfqd, bfqq, false, false); + + spin_unlock_irq(q->queue_lock); + + bfq_shutdown_timer_wq(bfqd); + + BUG_ON(hrtimer_active(&bfqd->idle_slice_timer)); + +#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED + blkcg_deactivate_policy(q, &blkcg_policy_bfq); +#else + bfq_put_async_queues(bfqd, bfqd->root_group); + kfree(bfqd->root_group); +#endif + + kfree(bfqd); +} + +static void bfq_init_root_group(struct bfq_group *root_group, + struct bfq_data *bfqd) +{ + int i; + +#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED + root_group->entity.parent = NULL; + root_group->my_entity = NULL; + root_group->bfqd = bfqd; +#endif + root_group->rq_pos_tree = RB_ROOT; + for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) + root_group->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT; + root_group->sched_data.bfq_class_idle_last_service = jiffies; +} + +static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) +{ + struct bfq_data *bfqd; + struct elevator_queue *eq; + + eq = elevator_alloc(q, e); + if (!eq) + return -ENOMEM; + + bfqd = kzalloc_node(sizeof(*bfqd), GFP_KERNEL, q->node); + if (!bfqd) { + kobject_put(&eq->kobj); + return -ENOMEM; + } + eq->elevator_data = bfqd; + + /* + * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues. + * Grab a permanent reference to it, so that the normal code flow + * will not attempt to free it. + */ + bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, NULL, 1, 0); + bfqd->oom_bfqq.ref++; + bfqd->oom_bfqq.new_ioprio = BFQ_DEFAULT_QUEUE_IOPRIO; + bfqd->oom_bfqq.new_ioprio_class = IOPRIO_CLASS_BE; + bfqd->oom_bfqq.entity.new_weight = + bfq_ioprio_to_weight(bfqd->oom_bfqq.new_ioprio); + + /* oom_bfqq does not participate to bursts */ + bfq_clear_bfqq_just_created(&bfqd->oom_bfqq); + /* + * Trigger weight initialization, according to ioprio, at the + * oom_bfqq's first activation. The oom_bfqq's ioprio and ioprio + * class won't be changed any more. + */ + bfqd->oom_bfqq.entity.prio_changed = 1; + + bfqd->queue = q; + + spin_lock_irq(q->queue_lock); + q->elevator = eq; + spin_unlock_irq(q->queue_lock); + + bfqd->root_group = bfq_create_group_hierarchy(bfqd, q->node); + if (!bfqd->root_group) + goto out_free; + bfq_init_root_group(bfqd->root_group, bfqd); + bfq_init_entity(&bfqd->oom_bfqq.entity, bfqd->root_group); + + hrtimer_init(&bfqd->idle_slice_timer, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); + bfqd->idle_slice_timer.function = bfq_idle_slice_timer; + + bfqd->queue_weights_tree = RB_ROOT; + bfqd->group_weights_tree = RB_ROOT; + + INIT_WORK(&bfqd->unplug_work, bfq_kick_queue); + + INIT_LIST_HEAD(&bfqd->active_list); + INIT_LIST_HEAD(&bfqd->idle_list); + INIT_HLIST_HEAD(&bfqd->burst_list); + + bfqd->hw_tag = -1; + + bfqd->bfq_max_budget = bfq_default_max_budget; + + bfqd->bfq_fifo_expire[0] = bfq_fifo_expire[0]; + bfqd->bfq_fifo_expire[1] = bfq_fifo_expire[1]; + bfqd->bfq_back_max = bfq_back_max; + bfqd->bfq_back_penalty = bfq_back_penalty; + bfqd->bfq_slice_idle = bfq_slice_idle; + bfqd->bfq_timeout = bfq_timeout; + + bfqd->bfq_requests_within_timer = 120; + + bfqd->bfq_large_burst_thresh = 8; + bfqd->bfq_burst_interval = msecs_to_jiffies(180); + + bfqd->low_latency = true; + + /* + * Trade-off between responsiveness and fairness. + */ + bfqd->bfq_wr_coeff = 30; + bfqd->bfq_wr_rt_max_time = msecs_to_jiffies(300); + bfqd->bfq_wr_max_time = 0; + bfqd->bfq_wr_min_idle_time = msecs_to_jiffies(2000); + bfqd->bfq_wr_min_inter_arr_async = msecs_to_jiffies(500); + bfqd->bfq_wr_max_softrt_rate = 7000; /* + * Approximate rate required + * to playback or record a + * high-definition compressed + * video. + */ + bfqd->wr_busy_queues = 0; + + /* + * Begin by assuming, optimistically, that the device is a + * high-speed one, and that its peak rate is equal to 2/3 of + * the highest reference rate. + */ + bfqd->RT_prod = R_fast[blk_queue_nonrot(bfqd->queue)] * + T_fast[blk_queue_nonrot(bfqd->queue)]; + bfqd->peak_rate = R_fast[blk_queue_nonrot(bfqd->queue)] * 2 / 3; + bfqd->device_speed = BFQ_BFQD_FAST; + + return 0; + +out_free: + kfree(bfqd); + kobject_put(&eq->kobj); + return -ENOMEM; +} + +static void bfq_slab_kill(void) +{ + kmem_cache_destroy(bfq_pool); +} + +static int __init bfq_slab_setup(void) +{ + bfq_pool = KMEM_CACHE(bfq_queue, 0); + if (!bfq_pool) + return -ENOMEM; + return 0; +} + +static ssize_t bfq_var_show(unsigned int var, char *page) +{ + return sprintf(page, "%u\n", var); +} + +static ssize_t bfq_var_store(unsigned long *var, const char *page, + size_t count) +{ + unsigned long new_val; + int ret = kstrtoul(page, 10, &new_val); + + if (ret == 0) + *var = new_val; + + return count; +} + +static ssize_t bfq_wr_max_time_show(struct elevator_queue *e, char *page) +{ + struct bfq_data *bfqd = e->elevator_data; + + return sprintf(page, "%d\n", bfqd->bfq_wr_max_time > 0 ? + jiffies_to_msecs(bfqd->bfq_wr_max_time) : + jiffies_to_msecs(bfq_wr_duration(bfqd))); +} + +static ssize_t bfq_weights_show(struct elevator_queue *e, char *page) +{ + struct bfq_queue *bfqq; + struct bfq_data *bfqd = e->elevator_data; + ssize_t num_char = 0; + + num_char += sprintf(page + num_char, "Tot reqs queued %d\n\n", + bfqd->queued); + + spin_lock_irq(bfqd->queue->queue_lock); + + num_char += sprintf(page + num_char, "Active:\n"); + list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) { + num_char += sprintf(page + num_char, + "pid%d: weight %hu, nr_queued %d %d, ", + bfqq->pid, + bfqq->entity.weight, + bfqq->queued[0], + bfqq->queued[1]); + num_char += sprintf(page + num_char, + "dur %d/%u\n", + jiffies_to_msecs( + jiffies - + bfqq->last_wr_start_finish), + jiffies_to_msecs(bfqq->wr_cur_max_time)); + } + + num_char += sprintf(page + num_char, "Idle:\n"); + list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list) { + num_char += sprintf(page + num_char, + "pid%d: weight %hu, dur %d/%u\n", + bfqq->pid, + bfqq->entity.weight, + jiffies_to_msecs(jiffies - + bfqq->last_wr_start_finish), + jiffies_to_msecs(bfqq->wr_cur_max_time)); + } + + spin_unlock_irq(bfqd->queue->queue_lock); + + return num_char; +} + +#define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \ +static ssize_t __FUNC(struct elevator_queue *e, char *page) \ +{ \ + struct bfq_data *bfqd = e->elevator_data; \ + u64 __data = __VAR; \ + if (__CONV == 1) \ + __data = jiffies_to_msecs(__data); \ + else if (__CONV == 2) \ + __data = div_u64(__data, NSEC_PER_MSEC); \ + return bfq_var_show(__data, (page)); \ +} +SHOW_FUNCTION(bfq_fifo_expire_sync_show, bfqd->bfq_fifo_expire[1], 2); +SHOW_FUNCTION(bfq_fifo_expire_async_show, bfqd->bfq_fifo_expire[0], 2); +SHOW_FUNCTION(bfq_back_seek_max_show, bfqd->bfq_back_max, 0); +SHOW_FUNCTION(bfq_back_seek_penalty_show, bfqd->bfq_back_penalty, 0); +SHOW_FUNCTION(bfq_slice_idle_show, bfqd->bfq_slice_idle, 2); +SHOW_FUNCTION(bfq_max_budget_show, bfqd->bfq_user_max_budget, 0); +SHOW_FUNCTION(bfq_timeout_sync_show, bfqd->bfq_timeout, 1); +SHOW_FUNCTION(bfq_strict_guarantees_show, bfqd->strict_guarantees, 0); +SHOW_FUNCTION(bfq_low_latency_show, bfqd->low_latency, 0); +SHOW_FUNCTION(bfq_wr_coeff_show, bfqd->bfq_wr_coeff, 0); +SHOW_FUNCTION(bfq_wr_rt_max_time_show, bfqd->bfq_wr_rt_max_time, 1); +SHOW_FUNCTION(bfq_wr_min_idle_time_show, bfqd->bfq_wr_min_idle_time, 1); +SHOW_FUNCTION(bfq_wr_min_inter_arr_async_show, bfqd->bfq_wr_min_inter_arr_async, + 1); +SHOW_FUNCTION(bfq_wr_max_softrt_rate_show, bfqd->bfq_wr_max_softrt_rate, 0); +#undef SHOW_FUNCTION + +#define USEC_SHOW_FUNCTION(__FUNC, __VAR) \ +static ssize_t __FUNC(struct elevator_queue *e, char *page) \ +{ \ + struct bfq_data *bfqd = e->elevator_data; \ + u64 __data = __VAR; \ + __data = div_u64(__data, NSEC_PER_USEC); \ + return bfq_var_show(__data, (page)); \ +} +USEC_SHOW_FUNCTION(bfq_slice_idle_us_show, bfqd->bfq_slice_idle); +#undef USEC_SHOW_FUNCTION + +#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \ +static ssize_t \ +__FUNC(struct elevator_queue *e, const char *page, size_t count) \ +{ \ + struct bfq_data *bfqd = e->elevator_data; \ + unsigned long uninitialized_var(__data); \ + int ret = bfq_var_store(&__data, (page), count); \ + if (__data < (MIN)) \ + __data = (MIN); \ + else if (__data > (MAX)) \ + __data = (MAX); \ + if (__CONV == 1) \ + *(__PTR) = msecs_to_jiffies(__data); \ + else if (__CONV == 2) \ + *(__PTR) = (u64)__data * NSEC_PER_MSEC; \ + else \ + *(__PTR) = __data; \ + return ret; \ +} +STORE_FUNCTION(bfq_fifo_expire_sync_store, &bfqd->bfq_fifo_expire[1], 1, + INT_MAX, 2); +STORE_FUNCTION(bfq_fifo_expire_async_store, &bfqd->bfq_fifo_expire[0], 1, + INT_MAX, 2); +STORE_FUNCTION(bfq_back_seek_max_store, &bfqd->bfq_back_max, 0, INT_MAX, 0); +STORE_FUNCTION(bfq_back_seek_penalty_store, &bfqd->bfq_back_penalty, 1, + INT_MAX, 0); +STORE_FUNCTION(bfq_slice_idle_store, &bfqd->bfq_slice_idle, 0, INT_MAX, 2); +STORE_FUNCTION(bfq_wr_coeff_store, &bfqd->bfq_wr_coeff, 1, INT_MAX, 0); +STORE_FUNCTION(bfq_wr_max_time_store, &bfqd->bfq_wr_max_time, 0, INT_MAX, 1); +STORE_FUNCTION(bfq_wr_rt_max_time_store, &bfqd->bfq_wr_rt_max_time, 0, INT_MAX, + 1); +STORE_FUNCTION(bfq_wr_min_idle_time_store, &bfqd->bfq_wr_min_idle_time, 0, + INT_MAX, 1); +STORE_FUNCTION(bfq_wr_min_inter_arr_async_store, + &bfqd->bfq_wr_min_inter_arr_async, 0, INT_MAX, 1); +STORE_FUNCTION(bfq_wr_max_softrt_rate_store, &bfqd->bfq_wr_max_softrt_rate, 0, + INT_MAX, 0); +#undef STORE_FUNCTION + +#define USEC_STORE_FUNCTION(__FUNC, __PTR, MIN, MAX) \ +static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count)\ +{ \ + struct bfq_data *bfqd = e->elevator_data; \ + unsigned long uninitialized_var(__data); \ + int ret = bfq_var_store(&__data, (page), count); \ + if (__data < (MIN)) \ + __data = (MIN); \ + else if (__data > (MAX)) \ + __data = (MAX); \ + *(__PTR) = (u64)__data * NSEC_PER_USEC; \ + return ret; \ +} +USEC_STORE_FUNCTION(bfq_slice_idle_us_store, &bfqd->bfq_slice_idle, 0, + UINT_MAX); +#undef USEC_STORE_FUNCTION + +/* do nothing for the moment */ +static ssize_t bfq_weights_store(struct elevator_queue *e, + const char *page, size_t count) +{ + return count; +} + +static ssize_t bfq_max_budget_store(struct elevator_queue *e, + const char *page, size_t count) +{ + struct bfq_data *bfqd = e->elevator_data; + unsigned long uninitialized_var(__data); + int ret = bfq_var_store(&__data, (page), count); + + if (__data == 0) + bfqd->bfq_max_budget = bfq_calc_max_budget(bfqd); + else { + if (__data > INT_MAX) + __data = INT_MAX; + bfqd->bfq_max_budget = __data; + } + + bfqd->bfq_user_max_budget = __data; + + return ret; +} + +/* + * Leaving this name to preserve name compatibility with cfq + * parameters, but this timeout is used for both sync and async. + */ +static ssize_t bfq_timeout_sync_store(struct elevator_queue *e, + const char *page, size_t count) +{ + struct bfq_data *bfqd = e->elevator_data; + unsigned long uninitialized_var(__data); + int ret = bfq_var_store(&__data, (page), count); + + if (__data < 1) + __data = 1; + else if (__data > INT_MAX) + __data = INT_MAX; + + bfqd->bfq_timeout = msecs_to_jiffies(__data); + if (bfqd->bfq_user_max_budget == 0) + bfqd->bfq_max_budget = bfq_calc_max_budget(bfqd); + + return ret; +} + +static ssize_t bfq_strict_guarantees_store(struct elevator_queue *e, + const char *page, size_t count) +{ + struct bfq_data *bfqd = e->elevator_data; + unsigned long uninitialized_var(__data); + int ret = bfq_var_store(&__data, (page), count); + + if (__data > 1) + __data = 1; + if (!bfqd->strict_guarantees && __data == 1 + && bfqd->bfq_slice_idle < 8 * NSEC_PER_MSEC) + bfqd->bfq_slice_idle = 8 * NSEC_PER_MSEC; + + bfqd->strict_guarantees = __data; + + return ret; +} + +static ssize_t bfq_low_latency_store(struct elevator_queue *e, + const char *page, size_t count) +{ + struct bfq_data *bfqd = e->elevator_data; + unsigned long uninitialized_var(__data); + int ret = bfq_var_store(&__data, (page), count); + + if (__data > 1) + __data = 1; + if (__data == 0 && bfqd->low_latency != 0) + bfq_end_wr(bfqd); + bfqd->low_latency = __data; + + return ret; +} + +#define BFQ_ATTR(name) \ + __ATTR(name, S_IRUGO|S_IWUSR, bfq_##name##_show, bfq_##name##_store) + +static struct elv_fs_entry bfq_attrs[] = { + BFQ_ATTR(fifo_expire_sync), + BFQ_ATTR(fifo_expire_async), + BFQ_ATTR(back_seek_max), + BFQ_ATTR(back_seek_penalty), + BFQ_ATTR(slice_idle), + BFQ_ATTR(slice_idle_us), + BFQ_ATTR(max_budget), + BFQ_ATTR(timeout_sync), + BFQ_ATTR(strict_guarantees), + BFQ_ATTR(low_latency), + BFQ_ATTR(wr_coeff), + BFQ_ATTR(wr_max_time), + BFQ_ATTR(wr_rt_max_time), + BFQ_ATTR(wr_min_idle_time), + BFQ_ATTR(wr_min_inter_arr_async), + BFQ_ATTR(wr_max_softrt_rate), + BFQ_ATTR(weights), + __ATTR_NULL +}; + +static struct elevator_type iosched_bfq = { + .ops.sq = { + .elevator_merge_fn = bfq_merge, + .elevator_merged_fn = bfq_merged_request, + .elevator_merge_req_fn = bfq_merged_requests, +#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED + .elevator_bio_merged_fn = bfq_bio_merged, +#endif + .elevator_allow_bio_merge_fn = bfq_allow_bio_merge, + .elevator_allow_rq_merge_fn = bfq_allow_rq_merge, + .elevator_dispatch_fn = bfq_dispatch_requests, + .elevator_add_req_fn = bfq_insert_request, + .elevator_activate_req_fn = bfq_activate_request, + .elevator_deactivate_req_fn = bfq_deactivate_request, + .elevator_completed_req_fn = bfq_completed_request, + .elevator_former_req_fn = elv_rb_former_request, + .elevator_latter_req_fn = elv_rb_latter_request, + .elevator_init_icq_fn = bfq_init_icq, + .elevator_exit_icq_fn = bfq_exit_icq, + .elevator_set_req_fn = bfq_set_request, + .elevator_put_req_fn = bfq_put_request, + .elevator_may_queue_fn = bfq_may_queue, + .elevator_init_fn = bfq_init_queue, + .elevator_exit_fn = bfq_exit_queue, + }, + .icq_size = sizeof(struct bfq_io_cq), + .icq_align = __alignof__(struct bfq_io_cq), + .elevator_attrs = bfq_attrs, + .elevator_name = "bfq-sq", + .elevator_owner = THIS_MODULE, +}; + +#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED +static struct blkcg_policy blkcg_policy_bfq = { + .dfl_cftypes = bfq_blkg_files, + .legacy_cftypes = bfq_blkcg_legacy_files, + + .cpd_alloc_fn = bfq_cpd_alloc, + .cpd_init_fn = bfq_cpd_init, + .cpd_bind_fn = bfq_cpd_init, + .cpd_free_fn = bfq_cpd_free, + + .pd_alloc_fn = bfq_pd_alloc, + .pd_init_fn = bfq_pd_init, + .pd_offline_fn = bfq_pd_offline, + .pd_free_fn = bfq_pd_free, + .pd_reset_stats_fn = bfq_pd_reset_stats, +}; +#endif + +static int __init bfq_init(void) +{ + int ret; + char msg[60] = "BFQ I/O-scheduler: v8r12"; + +#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED + ret = blkcg_policy_register(&blkcg_policy_bfq); + if (ret) + return ret; +#endif + + ret = -ENOMEM; + if (bfq_slab_setup()) + goto err_pol_unreg; + + /* + * Times to load large popular applications for the typical + * systems installed on the reference devices (see the + * comments before the definitions of the next two + * arrays). Actually, we use slightly slower values, as the + * estimated peak rate tends to be smaller than the actual + * peak rate. The reason for this last fact is that estimates + * are computed over much shorter time intervals than the long + * intervals typically used for benchmarking. Why? First, to + * adapt more quickly to variations. Second, because an I/O + * scheduler cannot rely on a peak-rate-evaluation workload to + * be run for a long time. + */ + T_slow[0] = msecs_to_jiffies(3500); /* actually 4 sec */ + T_slow[1] = msecs_to_jiffies(6000); /* actually 6.5 sec */ + T_fast[0] = msecs_to_jiffies(7000); /* actually 8 sec */ + T_fast[1] = msecs_to_jiffies(2500); /* actually 3 sec */ + + /* + * Thresholds that determine the switch between speed classes + * (see the comments before the definition of the array + * device_speed_thresh). These thresholds are biased towards + * transitions to the fast class. This is safer than the + * opposite bias. In fact, a wrong transition to the slow + * class results in short weight-raising periods, because the + * speed of the device then tends to be higher that the + * reference peak rate. On the opposite end, a wrong + * transition to the fast class tends to increase + * weight-raising periods, because of the opposite reason. + */ + device_speed_thresh[0] = (4 * R_slow[0]) / 3; + device_speed_thresh[1] = (4 * R_slow[1]) / 3; + + ret = elv_register(&iosched_bfq); + if (ret) + goto err_pol_unreg; + +#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED + strcat(msg, " (with cgroups support)"); +#endif + pr_info("%s", msg); + + return 0; + +err_pol_unreg: +#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED + blkcg_policy_unregister(&blkcg_policy_bfq); +#endif + return ret; +} + +static void __exit bfq_exit(void) +{ + elv_unregister(&iosched_bfq); +#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED + blkcg_policy_unregister(&blkcg_policy_bfq); +#endif + bfq_slab_kill(); +} + +module_init(bfq_init); +module_exit(bfq_exit); + +MODULE_AUTHOR("Arianna Avanzini, Fabio Checconi, Paolo Valente"); +MODULE_LICENSE("GPL"); From e24d2e6461479dbd13d58be2dc44b23b5e24487c Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Mon, 19 Dec 2016 17:13:39 +0100 Subject: [PATCH 07/51] Add config and build bits for bfq-mq-iosched Signed-off-by: Paolo Valente --- block/Kconfig.iosched | 10 +++++++++ block/Makefile | 1 + block/bfq-cgroup-included.c | 4 ++-- block/bfq-mq-iosched.c | 25 ++++++++++++----------- block/bfq-sched.c | 50 ++++++++++++++++++++++----------------------- block/bfq-sq-iosched.c | 24 +++++++++++----------- block/bfq.h | 36 +++++++++++++++++++++----------- 8 files changed, 88 insertions(+), 64 deletions(-) diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched index 9e3f4c2f7390..2d94af3d8b0a 100644 --- a/block/Kconfig.iosched +++ b/block/Kconfig.iosched @@ -96,6 +96,16 @@ config DEFAULT_IOSCHED default "bfq-sq" if DEFAULT_BFQ_SQ default "noop" if DEFAULT_NOOP +config MQ_IOSCHED_BFQ + tristate "BFQ-MQ I/O Scheduler" + default y + ---help--- + BFQ I/O scheduler for BLK-MQ. BFQ-MQ distributes bandwidth + among all processes according to their weights, regardless of + the device parameters and with any workload. It also + guarantees a low latency to interactive and soft real-time + applications. Details in Documentation/block/bfq-iosched.txt + config MQ_IOSCHED_DEADLINE tristate "MQ deadline I/O scheduler" default y diff --git a/block/Makefile b/block/Makefile index 59026b425791..a571329c23f0 100644 --- a/block/Makefile +++ b/block/Makefile @@ -25,6 +25,7 @@ obj-$(CONFIG_MQ_IOSCHED_KYBER) += kyber-iosched.o bfq-y := bfq-iosched.o bfq-wf2q.o bfq-cgroup.o obj-$(CONFIG_IOSCHED_BFQ) += bfq.o obj-$(CONFIG_IOSCHED_BFQ_SQ) += bfq-sq-iosched.o +obj-$(CONFIG_MQ_IOSCHED_BFQ) += bfq-mq-iosched.o obj-$(CONFIG_BLOCK_COMPAT) += compat_ioctl.o obj-$(CONFIG_BLK_CMDLINE_PARSER) += cmdline-parser.o diff --git a/block/bfq-cgroup-included.c b/block/bfq-cgroup-included.c index af7c216a3540..9c483b658179 100644 --- a/block/bfq-cgroup-included.c +++ b/block/bfq-cgroup-included.c @@ -15,7 +15,7 @@ * file. */ -#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED +#ifdef BFQ_GROUP_IOSCHED_ENABLED /* bfqg stats flags */ enum bfqg_stats_flags { @@ -1116,7 +1116,7 @@ static struct cftype bfq_blkg_files[] = { {} /* terminate */ }; -#else /* CONFIG_BFQ_SQ_GROUP_IOSCHED */ +#else /* BFQ_GROUP_IOSCHED_ENABLED */ static inline void bfqg_stats_update_io_add(struct bfq_group *bfqg, struct bfq_queue *bfqq, unsigned int op) { } diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c index 30d019fc67e0..e88e00f1e0a7 100644 --- a/block/bfq-mq-iosched.c +++ b/block/bfq-mq-iosched.c @@ -82,6 +82,7 @@ #include #include #include "blk.h" +#undef CONFIG_BFQ_GROUP_IOSCHED /* cgroups support not yet functional */ #include "bfq.h" /* Expiration time of sync (0) and async (1) requests, in ns. */ @@ -387,7 +388,7 @@ static bool bfq_differentiated_weights(struct bfq_data *bfqd) return (!RB_EMPTY_ROOT(&bfqd->queue_weights_tree) && (bfqd->queue_weights_tree.rb_node->rb_left || bfqd->queue_weights_tree.rb_node->rb_right) -#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED +#ifdef BFQ_GROUP_IOSCHED_ENABLED ) || (!RB_EMPTY_ROOT(&bfqd->group_weights_tree) && (bfqd->group_weights_tree.rb_node->rb_left || @@ -1672,7 +1673,7 @@ static void bfq_merged_request(struct request_queue *q, struct request *req, } } -#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED +#ifdef BFQ_GROUP_IOSCHED_ENABLED static void bfq_bio_merged(struct request_queue *q, struct request *req, struct bio *bio) { @@ -3879,7 +3880,7 @@ static int bfq_dispatch_requests(struct request_queue *q, int force) */ static void bfq_put_queue(struct bfq_queue *bfqq) { -#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED +#ifdef BFQ_GROUP_IOSCHED_ENABLED struct bfq_group *bfqg = bfqq_group(bfqq); #endif @@ -3909,7 +3910,7 @@ static void bfq_put_queue(struct bfq_queue *bfqq) bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p freed", bfqq); kmem_cache_free(bfq_pool, bfqq); -#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED +#ifdef BFQ_GROUP_IOSCHED_ENABLED bfqg_put(bfqg); #endif } @@ -4835,7 +4836,7 @@ static void bfq_exit_queue(struct elevator_queue *e) BUG_ON(hrtimer_active(&bfqd->idle_slice_timer)); -#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED +#ifdef BFQ_GROUP_IOSCHED_ENABLED blkcg_deactivate_policy(q, &blkcg_policy_bfq); #else bfq_put_async_queues(bfqd, bfqd->root_group); @@ -4850,7 +4851,7 @@ static void bfq_init_root_group(struct bfq_group *root_group, { int i; -#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED +#ifdef BFQ_GROUP_IOSCHED_ENABLED root_group->entity.parent = NULL; root_group->my_entity = NULL; root_group->bfqd = bfqd; @@ -5265,7 +5266,7 @@ static struct elevator_type iosched_bfq = { .elevator_merge_fn = bfq_merge, .elevator_merged_fn = bfq_merged_request, .elevator_merge_req_fn = bfq_merged_requests, -#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED +#ifdef BFQ_GROUP_IOSCHED_ENABLED .elevator_bio_merged_fn = bfq_bio_merged, #endif .elevator_allow_bio_merge_fn = bfq_allow_bio_merge, @@ -5292,7 +5293,7 @@ static struct elevator_type iosched_bfq = { .elevator_owner = THIS_MODULE, }; -#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED +#ifdef BFQ_GROUP_IOSCHED_ENABLED static struct blkcg_policy blkcg_policy_bfq = { .dfl_cftypes = bfq_blkg_files, .legacy_cftypes = bfq_blkcg_legacy_files, @@ -5315,7 +5316,7 @@ static int __init bfq_init(void) int ret; char msg[60] = "BFQ I/O-scheduler: v8r12"; -#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED +#ifdef BFQ_GROUP_IOSCHED_ENABLED ret = blkcg_policy_register(&blkcg_policy_bfq); if (ret) return ret; @@ -5362,7 +5363,7 @@ static int __init bfq_init(void) if (ret) goto err_pol_unreg; -#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED +#ifdef BFQ_GROUP_IOSCHED_ENABLED strcat(msg, " (with cgroups support)"); #endif pr_info("%s", msg); @@ -5370,7 +5371,7 @@ static int __init bfq_init(void) return 0; err_pol_unreg: -#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED +#ifdef BFQ_GROUP_IOSCHED_ENABLED blkcg_policy_unregister(&blkcg_policy_bfq); #endif return ret; @@ -5379,7 +5380,7 @@ static int __init bfq_init(void) static void __exit bfq_exit(void) { elv_unregister(&iosched_bfq); -#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED +#ifdef BFQ_GROUP_IOSCHED_ENABLED blkcg_policy_unregister(&blkcg_policy_bfq); #endif bfq_slab_kill(); diff --git a/block/bfq-sched.c b/block/bfq-sched.c index 5c0f9290a79c..b54a638186e3 100644 --- a/block/bfq-sched.c +++ b/block/bfq-sched.c @@ -136,7 +136,7 @@ static bool bfq_update_next_in_service(struct bfq_sched_data *sd, if (bfqq) bfq_log_bfqq(bfqq->bfqd, bfqq, "update_next_in_service: chosen this queue"); -#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED +#ifdef BFQ_GROUP_IOSCHED_ENABLED else { struct bfq_group *bfqg = container_of(next_in_service, @@ -149,7 +149,7 @@ static bool bfq_update_next_in_service(struct bfq_sched_data *sd, return parent_sched_may_change; } -#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED +#ifdef BFQ_GROUP_IOSCHED_ENABLED /* both next loops stop at one of the child entities of the root group */ #define for_each_entity(entity) \ for (; entity ; entity = entity->parent) @@ -243,7 +243,7 @@ static bool bfq_no_longer_next_in_service(struct bfq_entity *entity) return false; } -#else /* CONFIG_BFQ_SQ_GROUP_IOSCHED */ +#else /* BFQ_GROUP_IOSCHED_ENABLED */ #define for_each_entity(entity) \ for (; entity ; entity = NULL) @@ -260,7 +260,7 @@ static bool bfq_no_longer_next_in_service(struct bfq_entity *entity) return true; } -#endif /* CONFIG_BFQ_SQ_GROUP_IOSCHED */ +#endif /* BFQ_GROUP_IOSCHED_ENABLED */ /* * Shift for timestamp calculations. This actually limits the maximum @@ -323,7 +323,7 @@ static void bfq_calc_finish(struct bfq_entity *entity, unsigned long service) bfq_log_bfqq(bfqq->bfqd, bfqq, "calc_finish: start %llu, finish %llu, delta %llu", start, finish, delta); -#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED +#ifdef BFQ_GROUP_IOSCHED_ENABLED } else { struct bfq_group *bfqg = container_of(entity, struct bfq_group, entity); @@ -473,7 +473,7 @@ static void bfq_update_active_node(struct rb_node *node) bfq_log_bfqq(bfqq->bfqd, bfqq, "update_active_node: new min_start %llu", ((entity->min_start>>10)*1000)>>12); -#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED +#ifdef BFQ_GROUP_IOSCHED_ENABLED } else { struct bfq_group *bfqg = container_of(entity, struct bfq_group, entity); @@ -540,7 +540,7 @@ static void bfq_active_insert(struct bfq_service_tree *st, { struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); struct rb_node *node = &entity->rb_node; -#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED +#ifdef BFQ_GROUP_IOSCHED_ENABLED struct bfq_sched_data *sd = NULL; struct bfq_group *bfqg = NULL; struct bfq_data *bfqd = NULL; @@ -555,7 +555,7 @@ static void bfq_active_insert(struct bfq_service_tree *st, bfq_update_active_tree(node); -#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED +#ifdef BFQ_GROUP_IOSCHED_ENABLED sd = entity->sched_data; bfqg = container_of(sd, struct bfq_group, sched_data); BUG_ON(!bfqg); @@ -563,7 +563,7 @@ static void bfq_active_insert(struct bfq_service_tree *st, #endif if (bfqq) list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list); -#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED +#ifdef BFQ_GROUP_IOSCHED_ENABLED else { /* bfq_group */ BUG_ON(!bfqd); bfq_weights_tree_add(bfqd, entity, &bfqd->group_weights_tree); @@ -652,7 +652,7 @@ static void bfq_active_extract(struct bfq_service_tree *st, { struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); struct rb_node *node; -#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED +#ifdef BFQ_GROUP_IOSCHED_ENABLED struct bfq_sched_data *sd = NULL; struct bfq_group *bfqg = NULL; struct bfq_data *bfqd = NULL; @@ -664,7 +664,7 @@ static void bfq_active_extract(struct bfq_service_tree *st, if (node) bfq_update_active_tree(node); -#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED +#ifdef BFQ_GROUP_IOSCHED_ENABLED sd = entity->sched_data; bfqg = container_of(sd, struct bfq_group, sched_data); BUG_ON(!bfqg); @@ -672,7 +672,7 @@ static void bfq_active_extract(struct bfq_service_tree *st, #endif if (bfqq) list_del(&bfqq->bfqq_list); -#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED +#ifdef BFQ_GROUP_IOSCHED_ENABLED else { /* bfq_group */ BUG_ON(!bfqd); bfq_weights_tree_remove(bfqd, entity, @@ -809,14 +809,14 @@ __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st, unsigned int prev_weight, new_weight; struct bfq_data *bfqd = NULL; struct rb_root *root; -#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED +#ifdef BFQ_GROUP_IOSCHED_ENABLED struct bfq_sched_data *sd; struct bfq_group *bfqg; #endif if (bfqq) bfqd = bfqq->bfqd; -#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED +#ifdef BFQ_GROUP_IOSCHED_ENABLED else { sd = entity->my_sched_data; bfqg = container_of(sd, struct bfq_group, sched_data); @@ -907,7 +907,7 @@ __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st, return new_st; } -#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED +#ifdef BFQ_GROUP_IOSCHED_ENABLED static void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg); #endif @@ -936,7 +936,7 @@ static void bfq_bfqq_served(struct bfq_queue *bfqq, int served) st->vtime += bfq_delta(served, st->wsum); bfq_forget_idle(st); } -#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED +#ifdef BFQ_GROUP_IOSCHED_ENABLED bfqg_stats_set_start_empty_time(bfqq_group(bfqq)); #endif st = bfq_entity_service_tree(&bfqq->entity); @@ -1060,7 +1060,7 @@ static void bfq_update_fin_time_enqueue(struct bfq_entity *entity, bfq_log_bfqq(bfqq->bfqd, bfqq, "__activate_entity: new queue finish %llu", ((entity->finish>>10)*1000)>>12); -#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED +#ifdef BFQ_GROUP_IOSCHED_ENABLED } else { struct bfq_group *bfqg = container_of(entity, struct bfq_group, entity); @@ -1078,7 +1078,7 @@ static void bfq_update_fin_time_enqueue(struct bfq_entity *entity, bfq_log_bfqq(bfqq->bfqd, bfqq, "__activate_entity: queue %seligible in st %p", entity->start <= st->vtime ? "" : "non ", st); -#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED +#ifdef BFQ_GROUP_IOSCHED_ENABLED } else { struct bfq_group *bfqg = container_of(entity, struct bfq_group, entity); @@ -1153,7 +1153,7 @@ static void __bfq_activate_entity(struct bfq_entity *entity, BUG_ON(entity->on_st && bfqq); -#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED +#ifdef BFQ_GROUP_IOSCHED_ENABLED if (entity->on_st && !bfqq) { struct bfq_group *bfqg = container_of(entity, struct bfq_group, @@ -1485,7 +1485,7 @@ static void bfq_deactivate_entity(struct bfq_entity *entity, if (bfqq) bfq_log_bfqq(bfqq->bfqd, bfqq, "invoking udpdate_next for this queue"); -#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED +#ifdef BFQ_GROUP_IOSCHED_ENABLED else { struct bfq_group *bfqg = container_of(entity, @@ -1525,7 +1525,7 @@ static u64 bfq_calc_vtime_jump(struct bfq_service_tree *st) bfq_log_bfqq(bfqq->bfqd, bfqq, "calc_vtime_jump: new value %llu", root_entity->min_start); -#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED +#ifdef BFQ_GROUP_IOSCHED_ENABLED else { struct bfq_group *bfqg = container_of(root_entity, struct bfq_group, @@ -1661,7 +1661,7 @@ __bfq_lookup_next_entity(struct bfq_service_tree *st, bool in_service "__lookup_next: start %llu vtime %llu st %p", ((entity->start>>10)*1000)>>12, ((new_vtime>>10)*1000)>>12, st); -#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED +#ifdef BFQ_GROUP_IOSCHED_ENABLED else { struct bfq_group *bfqg = container_of(entity, struct bfq_group, entity); @@ -1735,7 +1735,7 @@ static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd) if (bfqq) bfq_log_bfqq(bfqq->bfqd, bfqq, "chosen from st %p %d", st + class_idx, class_idx); -#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED +#ifdef BFQ_GROUP_IOSCHED_ENABLED else { struct bfq_group *bfqg = container_of(entity, struct bfq_group, entity); @@ -1777,7 +1777,7 @@ static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd) */ sd = &bfqd->root_group->sched_data; for (; sd ; sd = entity->my_sched_data) { -#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED +#ifdef BFQ_GROUP_IOSCHED_ENABLED if (entity) { struct bfq_group *bfqg = container_of(entity, struct bfq_group, entity); @@ -1867,7 +1867,7 @@ static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd) bfq_log_bfqq(bfqd, bfqq, "get_next_queue: this queue, finish %llu", (((entity->finish>>10)*1000)>>10)>>2); -#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED +#ifdef BFQ_GROUP_IOSCHED_ENABLED else { struct bfq_group *bfqg = container_of(entity, struct bfq_group, entity); diff --git a/block/bfq-sq-iosched.c b/block/bfq-sq-iosched.c index 30d019fc67e0..25da0d1c0622 100644 --- a/block/bfq-sq-iosched.c +++ b/block/bfq-sq-iosched.c @@ -387,7 +387,7 @@ static bool bfq_differentiated_weights(struct bfq_data *bfqd) return (!RB_EMPTY_ROOT(&bfqd->queue_weights_tree) && (bfqd->queue_weights_tree.rb_node->rb_left || bfqd->queue_weights_tree.rb_node->rb_right) -#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED +#ifdef BFQ_GROUP_IOSCHED_ENABLED ) || (!RB_EMPTY_ROOT(&bfqd->group_weights_tree) && (bfqd->group_weights_tree.rb_node->rb_left || @@ -1672,7 +1672,7 @@ static void bfq_merged_request(struct request_queue *q, struct request *req, } } -#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED +#ifdef BFQ_GROUP_IOSCHED_ENABLED static void bfq_bio_merged(struct request_queue *q, struct request *req, struct bio *bio) { @@ -3879,7 +3879,7 @@ static int bfq_dispatch_requests(struct request_queue *q, int force) */ static void bfq_put_queue(struct bfq_queue *bfqq) { -#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED +#ifdef BFQ_GROUP_IOSCHED_ENABLED struct bfq_group *bfqg = bfqq_group(bfqq); #endif @@ -3909,7 +3909,7 @@ static void bfq_put_queue(struct bfq_queue *bfqq) bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p freed", bfqq); kmem_cache_free(bfq_pool, bfqq); -#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED +#ifdef BFQ_GROUP_IOSCHED_ENABLED bfqg_put(bfqg); #endif } @@ -4835,7 +4835,7 @@ static void bfq_exit_queue(struct elevator_queue *e) BUG_ON(hrtimer_active(&bfqd->idle_slice_timer)); -#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED +#ifdef BFQ_GROUP_IOSCHED_ENABLED blkcg_deactivate_policy(q, &blkcg_policy_bfq); #else bfq_put_async_queues(bfqd, bfqd->root_group); @@ -4850,7 +4850,7 @@ static void bfq_init_root_group(struct bfq_group *root_group, { int i; -#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED +#ifdef BFQ_GROUP_IOSCHED_ENABLED root_group->entity.parent = NULL; root_group->my_entity = NULL; root_group->bfqd = bfqd; @@ -5265,7 +5265,7 @@ static struct elevator_type iosched_bfq = { .elevator_merge_fn = bfq_merge, .elevator_merged_fn = bfq_merged_request, .elevator_merge_req_fn = bfq_merged_requests, -#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED +#ifdef BFQ_GROUP_IOSCHED_ENABLED .elevator_bio_merged_fn = bfq_bio_merged, #endif .elevator_allow_bio_merge_fn = bfq_allow_bio_merge, @@ -5292,7 +5292,7 @@ static struct elevator_type iosched_bfq = { .elevator_owner = THIS_MODULE, }; -#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED +#ifdef BFQ_GROUP_IOSCHED_ENABLED static struct blkcg_policy blkcg_policy_bfq = { .dfl_cftypes = bfq_blkg_files, .legacy_cftypes = bfq_blkcg_legacy_files, @@ -5315,7 +5315,7 @@ static int __init bfq_init(void) int ret; char msg[60] = "BFQ I/O-scheduler: v8r12"; -#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED +#ifdef BFQ_GROUP_IOSCHED_ENABLED ret = blkcg_policy_register(&blkcg_policy_bfq); if (ret) return ret; @@ -5362,7 +5362,7 @@ static int __init bfq_init(void) if (ret) goto err_pol_unreg; -#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED +#ifdef BFQ_GROUP_IOSCHED_ENABLED strcat(msg, " (with cgroups support)"); #endif pr_info("%s", msg); @@ -5370,7 +5370,7 @@ static int __init bfq_init(void) return 0; err_pol_unreg: -#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED +#ifdef BFQ_GROUP_IOSCHED_ENABLED blkcg_policy_unregister(&blkcg_policy_bfq); #endif return ret; @@ -5379,7 +5379,7 @@ static int __init bfq_init(void) static void __exit bfq_exit(void) { elv_unregister(&iosched_bfq); -#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED +#ifdef BFQ_GROUP_IOSCHED_ENABLED blkcg_policy_unregister(&blkcg_policy_bfq); #endif bfq_slab_kill(); diff --git a/block/bfq.h b/block/bfq.h index 34fc4697fd89..53954d1b87f8 100644 --- a/block/bfq.h +++ b/block/bfq.h @@ -19,6 +19,18 @@ #include #include +/* + * Define an alternative macro to compile cgroups support. This is one + * of the steps needed to let bfq-mq share the files bfq-sched.c and + * bfq-cgroup.c with bfq-sq. For bfq-mq, the macro + * BFQ_GROUP_IOSCHED_ENABLED will be defined as a function of whether + * the configuration option CONFIG_BFQ_MQ_GROUP_IOSCHED, and not + * CONFIG_BFQ_GROUP_IOSCHED, is defined. + */ +#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED +#define BFQ_GROUP_IOSCHED_ENABLED +#endif + #define BFQ_IOPRIO_CLASSES 3 #define BFQ_CL_IDLE_TIMEOUT (HZ/5) @@ -344,7 +356,7 @@ struct bfq_io_cq { struct bfq_ttime ttime; /* per (request_queue, blkcg) ioprio */ int ioprio; -#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED +#ifdef BFQ_GROUP_IOSCHED_ENABLED uint64_t blkcg_serial_nr; /* the current blkcg serial */ #endif @@ -671,7 +683,7 @@ static const char *checked_dev_name(const struct device *dev) return nodev; } -#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED +#ifdef BFQ_GROUP_IOSCHED_ENABLED static struct bfq_group *bfqq_group(struct bfq_queue *bfqq); static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); @@ -696,7 +708,7 @@ static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); __pbuf, ##args); \ } while (0) -#else /* CONFIG_BFQ_SQ_GROUP_IOSCHED */ +#else /* BFQ_GROUP_IOSCHED_ENABLED */ #define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \ pr_crit("%s bfq%d%c " fmt "\n", \ @@ -705,7 +717,7 @@ static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); ##args) #define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do {} while (0) -#endif /* CONFIG_BFQ_SQ_GROUP_IOSCHED */ +#endif /* BFQ_GROUP_IOSCHED_ENABLED */ #define bfq_log(bfqd, fmt, args...) \ pr_crit("%s bfq " fmt "\n", \ @@ -713,7 +725,7 @@ static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); ##args) #else /* CONFIG_BFQ_REDIRECT_TO_CONSOLE */ -#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED +#ifdef BFQ_GROUP_IOSCHED_ENABLED static struct bfq_group *bfqq_group(struct bfq_queue *bfqq); static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); @@ -735,7 +747,7 @@ static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); blk_add_trace_msg((bfqd)->queue, "%s " fmt, __pbuf, ##args); \ } while (0) -#else /* CONFIG_BFQ_SQ_GROUP_IOSCHED */ +#else /* BFQ_GROUP_IOSCHED_ENABLED */ #define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \ blk_add_trace_msg((bfqd)->queue, "bfq%d%c " fmt, (bfqq)->pid, \ @@ -743,7 +755,7 @@ static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); ##args) #define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do {} while (0) -#endif /* CONFIG_BFQ_SQ_GROUP_IOSCHED */ +#endif /* BFQ_GROUP_IOSCHED_ENABLED */ #define bfq_log(bfqd, fmt, args...) \ blk_add_trace_msg((bfqd)->queue, "bfq " fmt, ##args) @@ -763,7 +775,7 @@ enum bfqq_expiration { struct bfqg_stats { -#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED +#ifdef BFQ_GROUP_IOSCHED_ENABLED /* number of ios merged */ struct blkg_rwstat merged; /* total time spent on device in ns, may not be accurate w/ queueing */ @@ -794,7 +806,7 @@ struct bfqg_stats { #endif }; -#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED +#ifdef BFQ_GROUP_IOSCHED_ENABLED /* * struct bfq_group_data - per-blkcg storage for the blkio subsystem. * @@ -895,7 +907,7 @@ bfq_entity_service_tree(struct bfq_entity *entity) bfq_log_bfqq(bfqq->bfqd, bfqq, "entity_service_tree %p %d", sched_data->service_tree + idx, idx); -#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED +#ifdef BFQ_GROUP_IOSCHED_ENABLED else { struct bfq_group *bfqg = container_of(entity, struct bfq_group, entity); @@ -924,7 +936,7 @@ static struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic) return bic->icq.q->elevator->elevator_data; } -#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED +#ifdef BFQ_GROUP_IOSCHED_ENABLED static struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq) { @@ -953,7 +965,7 @@ static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, struct bfq_io_cq *bic); static void bfq_end_wr_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg); -#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED +#ifdef BFQ_GROUP_IOSCHED_ENABLED static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg); #endif static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq); From add91dbd756cf8ca3aa3add9a19eef742d5fca6b Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Fri, 20 Jan 2017 09:18:25 +0100 Subject: [PATCH 08/51] Increase max policies for io controller To let bfq-mq policy be plugged too (however cgroups suppport is not yet functional in bfq-mq). Signed-off-by: Paolo Valente --- include/linux/blkdev.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index bf000c58644b..10f892ca585d 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -54,7 +54,7 @@ struct blk_stat_callback; * Maximum number of blkcg policies allowed to be registered concurrently. * Defined here to simplify include dependency. */ -#define BLKCG_MAX_POLS 4 +#define BLKCG_MAX_POLS 5 typedef void (rq_end_io_fn)(struct request *, blk_status_t); From 2c39a1d9ab4516d44e01e96f19f578b927e7f2e9 Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Mon, 19 Dec 2016 18:11:33 +0100 Subject: [PATCH 09/51] Copy header file bfq.h as bfq-mq.h This commit introduces the header file bfq-mq.h, that will play for bfq-mq-iosched.c the same role that bfq.h plays for bfq-iosched.c. For the moment, the file bfq-mq.h is just a copy of bfq.h. Signed-off-by: Paolo Valente --- block/bfq-mq-iosched.c | 2 +- block/bfq-mq.h | 973 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 974 insertions(+), 1 deletion(-) create mode 100644 block/bfq-mq.h diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c index e88e00f1e0a7..d1125aee658c 100644 --- a/block/bfq-mq-iosched.c +++ b/block/bfq-mq-iosched.c @@ -83,7 +83,7 @@ #include #include "blk.h" #undef CONFIG_BFQ_GROUP_IOSCHED /* cgroups support not yet functional */ -#include "bfq.h" +#include "bfq-mq.h" /* Expiration time of sync (0) and async (1) requests, in ns. */ static const u64 bfq_fifo_expire[2] = { NSEC_PER_SEC / 4, NSEC_PER_SEC / 8 }; diff --git a/block/bfq-mq.h b/block/bfq-mq.h new file mode 100644 index 000000000000..53954d1b87f8 --- /dev/null +++ b/block/bfq-mq.h @@ -0,0 +1,973 @@ +/* + * BFQ v8r12 for 4.11.0: data structures and common functions prototypes. + * + * Based on ideas and code from CFQ: + * Copyright (C) 2003 Jens Axboe + * + * Copyright (C) 2008 Fabio Checconi + * Paolo Valente + * + * Copyright (C) 2015 Paolo Valente + * + * Copyright (C) 2017 Paolo Valente + */ + +#ifndef _BFQ_H +#define _BFQ_H + +#include +#include +#include + +/* + * Define an alternative macro to compile cgroups support. This is one + * of the steps needed to let bfq-mq share the files bfq-sched.c and + * bfq-cgroup.c with bfq-sq. For bfq-mq, the macro + * BFQ_GROUP_IOSCHED_ENABLED will be defined as a function of whether + * the configuration option CONFIG_BFQ_MQ_GROUP_IOSCHED, and not + * CONFIG_BFQ_GROUP_IOSCHED, is defined. + */ +#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED +#define BFQ_GROUP_IOSCHED_ENABLED +#endif + +#define BFQ_IOPRIO_CLASSES 3 +#define BFQ_CL_IDLE_TIMEOUT (HZ/5) + +#define BFQ_MIN_WEIGHT 1 +#define BFQ_MAX_WEIGHT 1000 +#define BFQ_WEIGHT_CONVERSION_COEFF 10 + +#define BFQ_DEFAULT_QUEUE_IOPRIO 4 + +#define BFQ_WEIGHT_LEGACY_DFL 100 +#define BFQ_DEFAULT_GRP_IOPRIO 0 +#define BFQ_DEFAULT_GRP_CLASS IOPRIO_CLASS_BE + +/* + * Soft real-time applications are extremely more latency sensitive + * than interactive ones. Over-raise the weight of the former to + * privilege them against the latter. + */ +#define BFQ_SOFTRT_WEIGHT_FACTOR 100 + +struct bfq_entity; + +/** + * struct bfq_service_tree - per ioprio_class service tree. + * + * Each service tree represents a B-WF2Q+ scheduler on its own. Each + * ioprio_class has its own independent scheduler, and so its own + * bfq_service_tree. All the fields are protected by the queue lock + * of the containing bfqd. + */ +struct bfq_service_tree { + /* tree for active entities (i.e., those backlogged) */ + struct rb_root active; + /* tree for idle entities (i.e., not backlogged, with V <= F_i)*/ + struct rb_root idle; + + struct bfq_entity *first_idle; /* idle entity with minimum F_i */ + struct bfq_entity *last_idle; /* idle entity with maximum F_i */ + + u64 vtime; /* scheduler virtual time */ + /* scheduler weight sum; active and idle entities contribute to it */ + unsigned long wsum; +}; + +/** + * struct bfq_sched_data - multi-class scheduler. + * + * bfq_sched_data is the basic scheduler queue. It supports three + * ioprio_classes, and can be used either as a toplevel queue or as an + * intermediate queue in a hierarchical setup. + * + * The supported ioprio_classes are the same as in CFQ, in descending + * priority order, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE. + * Requests from higher priority queues are served before all the + * requests from lower priority queues; among requests of the same + * queue requests are served according to B-WF2Q+. + * + * The schedule is implemented by the service trees, plus the field + * @next_in_service, which points to the entity on the active trees + * that will be served next, if 1) no changes in the schedule occurs + * before the current in-service entity is expired, 2) the in-service + * queue becomes idle when it expires, and 3) if the entity pointed by + * in_service_entity is not a queue, then the in-service child entity + * of the entity pointed by in_service_entity becomes idle on + * expiration. This peculiar definition allows for the following + * optimization, not yet exploited: while a given entity is still in + * service, we already know which is the best candidate for next + * service among the other active entitities in the same parent + * entity. We can then quickly compare the timestamps of the + * in-service entity with those of such best candidate. + * + * All the fields are protected by the queue lock of the containing + * bfqd. + */ +struct bfq_sched_data { + struct bfq_entity *in_service_entity; /* entity in service */ + /* head-of-the-line entity in the scheduler (see comments above) */ + struct bfq_entity *next_in_service; + /* array of service trees, one per ioprio_class */ + struct bfq_service_tree service_tree[BFQ_IOPRIO_CLASSES]; + /* last time CLASS_IDLE was served */ + unsigned long bfq_class_idle_last_service; + +}; + +/** + * struct bfq_weight_counter - counter of the number of all active entities + * with a given weight. + */ +struct bfq_weight_counter { + unsigned int weight; /* weight of the entities this counter refers to */ + unsigned int num_active; /* nr of active entities with this weight */ + /* + * Weights tree member (see bfq_data's @queue_weights_tree and + * @group_weights_tree) + */ + struct rb_node weights_node; +}; + +/** + * struct bfq_entity - schedulable entity. + * + * A bfq_entity is used to represent either a bfq_queue (leaf node in the + * cgroup hierarchy) or a bfq_group into the upper level scheduler. Each + * entity belongs to the sched_data of the parent group in the cgroup + * hierarchy. Non-leaf entities have also their own sched_data, stored + * in @my_sched_data. + * + * Each entity stores independently its priority values; this would + * allow different weights on different devices, but this + * functionality is not exported to userspace by now. Priorities and + * weights are updated lazily, first storing the new values into the + * new_* fields, then setting the @prio_changed flag. As soon as + * there is a transition in the entity state that allows the priority + * update to take place the effective and the requested priority + * values are synchronized. + * + * Unless cgroups are used, the weight value is calculated from the + * ioprio to export the same interface as CFQ. When dealing with + * ``well-behaved'' queues (i.e., queues that do not spend too much + * time to consume their budget and have true sequential behavior, and + * when there are no external factors breaking anticipation) the + * relative weights at each level of the cgroups hierarchy should be + * guaranteed. All the fields are protected by the queue lock of the + * containing bfqd. + */ +struct bfq_entity { + struct rb_node rb_node; /* service_tree member */ + /* pointer to the weight counter associated with this entity */ + struct bfq_weight_counter *weight_counter; + + /* + * Flag, true if the entity is on a tree (either the active or + * the idle one of its service_tree) or is in service. + */ + bool on_st; + + u64 finish; /* B-WF2Q+ finish timestamp (aka F_i) */ + u64 start; /* B-WF2Q+ start timestamp (aka S_i) */ + + /* tree the entity is enqueued into; %NULL if not on a tree */ + struct rb_root *tree; + + /* + * minimum start time of the (active) subtree rooted at this + * entity; used for O(log N) lookups into active trees + */ + u64 min_start; + + /* amount of service received during the last service slot */ + int service; + + /* budget, used also to calculate F_i: F_i = S_i + @budget / @weight */ + int budget; + + unsigned int weight; /* weight of the queue */ + unsigned int new_weight; /* next weight if a change is in progress */ + + /* original weight, used to implement weight boosting */ + unsigned int orig_weight; + + /* parent entity, for hierarchical scheduling */ + struct bfq_entity *parent; + + /* + * For non-leaf nodes in the hierarchy, the associated + * scheduler queue, %NULL on leaf nodes. + */ + struct bfq_sched_data *my_sched_data; + /* the scheduler queue this entity belongs to */ + struct bfq_sched_data *sched_data; + + /* flag, set to request a weight, ioprio or ioprio_class change */ + int prio_changed; +}; + +struct bfq_group; + +/** + * struct bfq_queue - leaf schedulable entity. + * + * A bfq_queue is a leaf request queue; it can be associated with an + * io_context or more, if it is async or shared between cooperating + * processes. @cgroup holds a reference to the cgroup, to be sure that it + * does not disappear while a bfqq still references it (mostly to avoid + * races between request issuing and task migration followed by cgroup + * destruction). + * All the fields are protected by the queue lock of the containing bfqd. + */ +struct bfq_queue { + /* reference counter */ + int ref; + /* parent bfq_data */ + struct bfq_data *bfqd; + + /* current ioprio and ioprio class */ + unsigned short ioprio, ioprio_class; + /* next ioprio and ioprio class if a change is in progress */ + unsigned short new_ioprio, new_ioprio_class; + + /* + * Shared bfq_queue if queue is cooperating with one or more + * other queues. + */ + struct bfq_queue *new_bfqq; + /* request-position tree member (see bfq_group's @rq_pos_tree) */ + struct rb_node pos_node; + /* request-position tree root (see bfq_group's @rq_pos_tree) */ + struct rb_root *pos_root; + + /* sorted list of pending requests */ + struct rb_root sort_list; + /* if fifo isn't expired, next request to serve */ + struct request *next_rq; + /* number of sync and async requests queued */ + int queued[2]; + /* number of sync and async requests currently allocated */ + int allocated[2]; + /* number of pending metadata requests */ + int meta_pending; + /* fifo list of requests in sort_list */ + struct list_head fifo; + + /* entity representing this queue in the scheduler */ + struct bfq_entity entity; + + /* maximum budget allowed from the feedback mechanism */ + int max_budget; + /* budget expiration (in jiffies) */ + unsigned long budget_timeout; + + /* number of requests on the dispatch list or inside driver */ + int dispatched; + + unsigned int flags; /* status flags.*/ + + /* node for active/idle bfqq list inside parent bfqd */ + struct list_head bfqq_list; + + /* bit vector: a 1 for each seeky requests in history */ + u32 seek_history; + + /* node for the device's burst list */ + struct hlist_node burst_list_node; + + /* position of the last request enqueued */ + sector_t last_request_pos; + + /* Number of consecutive pairs of request completion and + * arrival, such that the queue becomes idle after the + * completion, but the next request arrives within an idle + * time slice; used only if the queue's IO_bound flag has been + * cleared. + */ + unsigned int requests_within_timer; + + /* pid of the process owning the queue, used for logging purposes */ + pid_t pid; + + /* + * Pointer to the bfq_io_cq owning the bfq_queue, set to %NULL + * if the queue is shared. + */ + struct bfq_io_cq *bic; + + /* current maximum weight-raising time for this queue */ + unsigned long wr_cur_max_time; + /* + * Minimum time instant such that, only if a new request is + * enqueued after this time instant in an idle @bfq_queue with + * no outstanding requests, then the task associated with the + * queue it is deemed as soft real-time (see the comments on + * the function bfq_bfqq_softrt_next_start()) + */ + unsigned long soft_rt_next_start; + /* + * Start time of the current weight-raising period if + * the @bfq-queue is being weight-raised, otherwise + * finish time of the last weight-raising period. + */ + unsigned long last_wr_start_finish; + /* factor by which the weight of this queue is multiplied */ + unsigned int wr_coeff; + /* + * Time of the last transition of the @bfq_queue from idle to + * backlogged. + */ + unsigned long last_idle_bklogged; + /* + * Cumulative service received from the @bfq_queue since the + * last transition from idle to backlogged. + */ + unsigned long service_from_backlogged; + /* + * Value of wr start time when switching to soft rt + */ + unsigned long wr_start_at_switch_to_srt; + + unsigned long split_time; /* time of last split */ +}; + +/** + * struct bfq_ttime - per process thinktime stats. + */ +struct bfq_ttime { + u64 last_end_request; /* completion time of last request */ + + u64 ttime_total; /* total process thinktime */ + unsigned long ttime_samples; /* number of thinktime samples */ + u64 ttime_mean; /* average process thinktime */ + +}; + +/** + * struct bfq_io_cq - per (request_queue, io_context) structure. + */ +struct bfq_io_cq { + /* associated io_cq structure */ + struct io_cq icq; /* must be the first member */ + /* array of two process queues, the sync and the async */ + struct bfq_queue *bfqq[2]; + /* associated @bfq_ttime struct */ + struct bfq_ttime ttime; + /* per (request_queue, blkcg) ioprio */ + int ioprio; +#ifdef BFQ_GROUP_IOSCHED_ENABLED + uint64_t blkcg_serial_nr; /* the current blkcg serial */ +#endif + + /* + * Snapshot of the has_short_time flag before merging; taken + * to remember its value while the queue is merged, so as to + * be able to restore it in case of split. + */ + bool saved_has_short_ttime; + /* + * Same purpose as the previous two fields for the I/O bound + * classification of a queue. + */ + bool saved_IO_bound; + + /* + * Same purpose as the previous fields for the value of the + * field keeping the queue's belonging to a large burst + */ + bool saved_in_large_burst; + /* + * True if the queue belonged to a burst list before its merge + * with another cooperating queue. + */ + bool was_in_burst_list; + + /* + * Similar to previous fields: save wr information. + */ + unsigned long saved_wr_coeff; + unsigned long saved_last_wr_start_finish; + unsigned long saved_wr_start_at_switch_to_srt; + unsigned int saved_wr_cur_max_time; +}; + +enum bfq_device_speed { + BFQ_BFQD_FAST, + BFQ_BFQD_SLOW, +}; + +/** + * struct bfq_data - per-device data structure. + * + * All the fields are protected by the @queue lock. + */ +struct bfq_data { + /* request queue for the device */ + struct request_queue *queue; + + /* root bfq_group for the device */ + struct bfq_group *root_group; + + /* + * rbtree of weight counters of @bfq_queues, sorted by + * weight. Used to keep track of whether all @bfq_queues have + * the same weight. The tree contains one counter for each + * distinct weight associated to some active and not + * weight-raised @bfq_queue (see the comments to the functions + * bfq_weights_tree_[add|remove] for further details). + */ + struct rb_root queue_weights_tree; + /* + * rbtree of non-queue @bfq_entity weight counters, sorted by + * weight. Used to keep track of whether all @bfq_groups have + * the same weight. The tree contains one counter for each + * distinct weight associated to some active @bfq_group (see + * the comments to the functions bfq_weights_tree_[add|remove] + * for further details). + */ + struct rb_root group_weights_tree; + + /* + * Number of bfq_queues containing requests (including the + * queue in service, even if it is idling). + */ + int busy_queues; + /* number of weight-raised busy @bfq_queues */ + int wr_busy_queues; + /* number of queued requests */ + int queued; + /* number of requests dispatched and waiting for completion */ + int rq_in_driver; + + /* + * Maximum number of requests in driver in the last + * @hw_tag_samples completed requests. + */ + int max_rq_in_driver; + /* number of samples used to calculate hw_tag */ + int hw_tag_samples; + /* flag set to one if the driver is showing a queueing behavior */ + int hw_tag; + + /* number of budgets assigned */ + int budgets_assigned; + + /* + * Timer set when idling (waiting) for the next request from + * the queue in service. + */ + struct hrtimer idle_slice_timer; + /* delayed work to restart dispatching on the request queue */ + struct work_struct unplug_work; + + /* bfq_queue in service */ + struct bfq_queue *in_service_queue; + /* bfq_io_cq (bic) associated with the @in_service_queue */ + struct bfq_io_cq *in_service_bic; + + /* on-disk position of the last served request */ + sector_t last_position; + + /* time of last request completion (ns) */ + u64 last_completion; + + /* time of first rq dispatch in current observation interval (ns) */ + u64 first_dispatch; + /* time of last rq dispatch in current observation interval (ns) */ + u64 last_dispatch; + + /* beginning of the last budget */ + ktime_t last_budget_start; + /* beginning of the last idle slice */ + ktime_t last_idling_start; + + /* number of samples in current observation interval */ + int peak_rate_samples; + /* num of samples of seq dispatches in current observation interval */ + u32 sequential_samples; + /* total num of sectors transferred in current observation interval */ + u64 tot_sectors_dispatched; + /* max rq size seen during current observation interval (sectors) */ + u32 last_rq_max_size; + /* time elapsed from first dispatch in current observ. interval (us) */ + u64 delta_from_first; + /* current estimate of device peak rate */ + u32 peak_rate; + + /* maximum budget allotted to a bfq_queue before rescheduling */ + int bfq_max_budget; + + /* list of all the bfq_queues active on the device */ + struct list_head active_list; + /* list of all the bfq_queues idle on the device */ + struct list_head idle_list; + + /* + * Timeout for async/sync requests; when it fires, requests + * are served in fifo order. + */ + u64 bfq_fifo_expire[2]; + /* weight of backward seeks wrt forward ones */ + unsigned int bfq_back_penalty; + /* maximum allowed backward seek */ + unsigned int bfq_back_max; + /* maximum idling time */ + u32 bfq_slice_idle; + + /* user-configured max budget value (0 for auto-tuning) */ + int bfq_user_max_budget; + /* + * Timeout for bfq_queues to consume their budget; used to + * prevent seeky queues from imposing long latencies to + * sequential or quasi-sequential ones (this also implies that + * seeky queues cannot receive guarantees in the service + * domain; after a timeout they are charged for the time they + * have been in service, to preserve fairness among them, but + * without service-domain guarantees). + */ + unsigned int bfq_timeout; + + /* + * Number of consecutive requests that must be issued within + * the idle time slice to set again idling to a queue which + * was marked as non-I/O-bound (see the definition of the + * IO_bound flag for further details). + */ + unsigned int bfq_requests_within_timer; + + /* + * Force device idling whenever needed to provide accurate + * service guarantees, without caring about throughput + * issues. CAVEAT: this may even increase latencies, in case + * of useless idling for processes that did stop doing I/O. + */ + bool strict_guarantees; + + /* + * Last time at which a queue entered the current burst of + * queues being activated shortly after each other; for more + * details about this and the following parameters related to + * a burst of activations, see the comments on the function + * bfq_handle_burst. + */ + unsigned long last_ins_in_burst; + /* + * Reference time interval used to decide whether a queue has + * been activated shortly after @last_ins_in_burst. + */ + unsigned long bfq_burst_interval; + /* number of queues in the current burst of queue activations */ + int burst_size; + + /* common parent entity for the queues in the burst */ + struct bfq_entity *burst_parent_entity; + /* Maximum burst size above which the current queue-activation + * burst is deemed as 'large'. + */ + unsigned long bfq_large_burst_thresh; + /* true if a large queue-activation burst is in progress */ + bool large_burst; + /* + * Head of the burst list (as for the above fields, more + * details in the comments on the function bfq_handle_burst). + */ + struct hlist_head burst_list; + + /* if set to true, low-latency heuristics are enabled */ + bool low_latency; + /* + * Maximum factor by which the weight of a weight-raised queue + * is multiplied. + */ + unsigned int bfq_wr_coeff; + /* maximum duration of a weight-raising period (jiffies) */ + unsigned int bfq_wr_max_time; + + /* Maximum weight-raising duration for soft real-time processes */ + unsigned int bfq_wr_rt_max_time; + /* + * Minimum idle period after which weight-raising may be + * reactivated for a queue (in jiffies). + */ + unsigned int bfq_wr_min_idle_time; + /* + * Minimum period between request arrivals after which + * weight-raising may be reactivated for an already busy async + * queue (in jiffies). + */ + unsigned long bfq_wr_min_inter_arr_async; + + /* Max service-rate for a soft real-time queue, in sectors/sec */ + unsigned int bfq_wr_max_softrt_rate; + /* + * Cached value of the product R*T, used for computing the + * maximum duration of weight raising automatically. + */ + u64 RT_prod; + /* device-speed class for the low-latency heuristic */ + enum bfq_device_speed device_speed; + + /* fallback dummy bfqq for extreme OOM conditions */ + struct bfq_queue oom_bfqq; +}; + +enum bfqq_state_flags { + BFQ_BFQQ_FLAG_just_created = 0, /* queue just allocated */ + BFQ_BFQQ_FLAG_busy, /* has requests or is in service */ + BFQ_BFQQ_FLAG_wait_request, /* waiting for a request */ + BFQ_BFQQ_FLAG_non_blocking_wait_rq, /* + * waiting for a request + * without idling the device + */ + BFQ_BFQQ_FLAG_must_alloc, /* must be allowed rq alloc */ + BFQ_BFQQ_FLAG_fifo_expire, /* FIFO checked in this slice */ + BFQ_BFQQ_FLAG_has_short_ttime, /* queue has a short think time */ + BFQ_BFQQ_FLAG_sync, /* synchronous queue */ + BFQ_BFQQ_FLAG_IO_bound, /* + * bfqq has timed-out at least once + * having consumed at most 2/10 of + * its budget + */ + BFQ_BFQQ_FLAG_in_large_burst, /* + * bfqq activated in a large burst, + * see comments to bfq_handle_burst. + */ + BFQ_BFQQ_FLAG_softrt_update, /* + * may need softrt-next-start + * update + */ + BFQ_BFQQ_FLAG_coop, /* bfqq is shared */ + BFQ_BFQQ_FLAG_split_coop /* shared bfqq will be split */ +}; + +#define BFQ_BFQQ_FNS(name) \ +static void bfq_mark_bfqq_##name(struct bfq_queue *bfqq) \ +{ \ + (bfqq)->flags |= (1 << BFQ_BFQQ_FLAG_##name); \ +} \ +static void bfq_clear_bfqq_##name(struct bfq_queue *bfqq) \ +{ \ + (bfqq)->flags &= ~(1 << BFQ_BFQQ_FLAG_##name); \ +} \ +static int bfq_bfqq_##name(const struct bfq_queue *bfqq) \ +{ \ + return ((bfqq)->flags & (1 << BFQ_BFQQ_FLAG_##name)) != 0; \ +} + +BFQ_BFQQ_FNS(just_created); +BFQ_BFQQ_FNS(busy); +BFQ_BFQQ_FNS(wait_request); +BFQ_BFQQ_FNS(non_blocking_wait_rq); +BFQ_BFQQ_FNS(must_alloc); +BFQ_BFQQ_FNS(fifo_expire); +BFQ_BFQQ_FNS(has_short_ttime); +BFQ_BFQQ_FNS(sync); +BFQ_BFQQ_FNS(IO_bound); +BFQ_BFQQ_FNS(in_large_burst); +BFQ_BFQQ_FNS(coop); +BFQ_BFQQ_FNS(split_coop); +BFQ_BFQQ_FNS(softrt_update); +#undef BFQ_BFQQ_FNS + +/* Logging facilities. */ +#ifdef CONFIG_BFQ_REDIRECT_TO_CONSOLE + +static const char *checked_dev_name(const struct device *dev) +{ + static const char nodev[] = "nodev"; + + if (dev) + return dev_name(dev); + + return nodev; +} + +#ifdef BFQ_GROUP_IOSCHED_ENABLED +static struct bfq_group *bfqq_group(struct bfq_queue *bfqq); +static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); + +#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \ + char __pbuf[128]; \ + \ + assert_spin_locked((bfqd)->queue->queue_lock); \ + blkg_path(bfqg_to_blkg(bfqq_group(bfqq)), __pbuf, sizeof(__pbuf)); \ + pr_crit("%s bfq%d%c %s " fmt "\n", \ + checked_dev_name((bfqd)->queue->backing_dev_info->dev), \ + (bfqq)->pid, \ + bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ + __pbuf, ##args); \ +} while (0) + +#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do { \ + char __pbuf[128]; \ + \ + blkg_path(bfqg_to_blkg(bfqg), __pbuf, sizeof(__pbuf)); \ + pr_crit("%s %s " fmt "\n", \ + checked_dev_name((bfqd)->queue->backing_dev_info->dev), \ + __pbuf, ##args); \ +} while (0) + +#else /* BFQ_GROUP_IOSCHED_ENABLED */ + +#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \ + pr_crit("%s bfq%d%c " fmt "\n", \ + checked_dev_name((bfqd)->queue->backing_dev_info->dev), \ + (bfqq)->pid, bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ + ##args) +#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do {} while (0) + +#endif /* BFQ_GROUP_IOSCHED_ENABLED */ + +#define bfq_log(bfqd, fmt, args...) \ + pr_crit("%s bfq " fmt "\n", \ + checked_dev_name((bfqd)->queue->backing_dev_info->dev), \ + ##args) + +#else /* CONFIG_BFQ_REDIRECT_TO_CONSOLE */ +#ifdef BFQ_GROUP_IOSCHED_ENABLED +static struct bfq_group *bfqq_group(struct bfq_queue *bfqq); +static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); + +#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \ + char __pbuf[128]; \ + \ + assert_spin_locked((bfqd)->queue->queue_lock); \ + blkg_path(bfqg_to_blkg(bfqq_group(bfqq)), __pbuf, sizeof(__pbuf)); \ + blk_add_trace_msg((bfqd)->queue, "bfq%d%c %s " fmt, \ + (bfqq)->pid, \ + bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ + __pbuf, ##args); \ +} while (0) + +#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do { \ + char __pbuf[128]; \ + \ + blkg_path(bfqg_to_blkg(bfqg), __pbuf, sizeof(__pbuf)); \ + blk_add_trace_msg((bfqd)->queue, "%s " fmt, __pbuf, ##args); \ +} while (0) + +#else /* BFQ_GROUP_IOSCHED_ENABLED */ + +#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \ + blk_add_trace_msg((bfqd)->queue, "bfq%d%c " fmt, (bfqq)->pid, \ + bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ + ##args) +#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do {} while (0) + +#endif /* BFQ_GROUP_IOSCHED_ENABLED */ + +#define bfq_log(bfqd, fmt, args...) \ + blk_add_trace_msg((bfqd)->queue, "bfq " fmt, ##args) +#endif /* CONFIG_BFQ_REDIRECT_TO_CONSOLE */ + +/* Expiration reasons. */ +enum bfqq_expiration { + BFQ_BFQQ_TOO_IDLE = 0, /* + * queue has been idling for + * too long + */ + BFQ_BFQQ_BUDGET_TIMEOUT, /* budget took too long to be used */ + BFQ_BFQQ_BUDGET_EXHAUSTED, /* budget consumed */ + BFQ_BFQQ_NO_MORE_REQUESTS, /* the queue has no more requests */ + BFQ_BFQQ_PREEMPTED /* preemption in progress */ +}; + + +struct bfqg_stats { +#ifdef BFQ_GROUP_IOSCHED_ENABLED + /* number of ios merged */ + struct blkg_rwstat merged; + /* total time spent on device in ns, may not be accurate w/ queueing */ + struct blkg_rwstat service_time; + /* total time spent waiting in scheduler queue in ns */ + struct blkg_rwstat wait_time; + /* number of IOs queued up */ + struct blkg_rwstat queued; + /* total disk time and nr sectors dispatched by this group */ + struct blkg_stat time; + /* sum of number of ios queued across all samples */ + struct blkg_stat avg_queue_size_sum; + /* count of samples taken for average */ + struct blkg_stat avg_queue_size_samples; + /* how many times this group has been removed from service tree */ + struct blkg_stat dequeue; + /* total time spent waiting for it to be assigned a timeslice. */ + struct blkg_stat group_wait_time; + /* time spent idling for this blkcg_gq */ + struct blkg_stat idle_time; + /* total time with empty current active q with other requests queued */ + struct blkg_stat empty_time; + /* fields after this shouldn't be cleared on stat reset */ + uint64_t start_group_wait_time; + uint64_t start_idle_time; + uint64_t start_empty_time; + uint16_t flags; +#endif +}; + +#ifdef BFQ_GROUP_IOSCHED_ENABLED +/* + * struct bfq_group_data - per-blkcg storage for the blkio subsystem. + * + * @ps: @blkcg_policy_storage that this structure inherits + * @weight: weight of the bfq_group + */ +struct bfq_group_data { + /* must be the first member */ + struct blkcg_policy_data pd; + + unsigned int weight; +}; + +/** + * struct bfq_group - per (device, cgroup) data structure. + * @entity: schedulable entity to insert into the parent group sched_data. + * @sched_data: own sched_data, to contain child entities (they may be + * both bfq_queues and bfq_groups). + * @bfqd: the bfq_data for the device this group acts upon. + * @async_bfqq: array of async queues for all the tasks belonging to + * the group, one queue per ioprio value per ioprio_class, + * except for the idle class that has only one queue. + * @async_idle_bfqq: async queue for the idle class (ioprio is ignored). + * @my_entity: pointer to @entity, %NULL for the toplevel group; used + * to avoid too many special cases during group creation/ + * migration. + * @active_entities: number of active entities belonging to the group; + * unused for the root group. Used to know whether there + * are groups with more than one active @bfq_entity + * (see the comments to the function + * bfq_bfqq_may_idle()). + * @rq_pos_tree: rbtree sorted by next_request position, used when + * determining if two or more queues have interleaving + * requests (see bfq_find_close_cooperator()). + * + * Each (device, cgroup) pair has its own bfq_group, i.e., for each cgroup + * there is a set of bfq_groups, each one collecting the lower-level + * entities belonging to the group that are acting on the same device. + * + * Locking works as follows: + * o @bfqd is protected by the queue lock, RCU is used to access it + * from the readers. + * o All the other fields are protected by the @bfqd queue lock. + */ +struct bfq_group { + /* must be the first member */ + struct blkg_policy_data pd; + + struct bfq_entity entity; + struct bfq_sched_data sched_data; + + void *bfqd; + + struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR]; + struct bfq_queue *async_idle_bfqq; + + struct bfq_entity *my_entity; + + int active_entities; + + struct rb_root rq_pos_tree; + + struct bfqg_stats stats; +}; + +#else +struct bfq_group { + struct bfq_sched_data sched_data; + + struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR]; + struct bfq_queue *async_idle_bfqq; + + struct rb_root rq_pos_tree; +}; +#endif + +static struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity); + +static unsigned int bfq_class_idx(struct bfq_entity *entity) +{ + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); + + return bfqq ? bfqq->ioprio_class - 1 : + BFQ_DEFAULT_GRP_CLASS - 1; +} + +static struct bfq_service_tree * +bfq_entity_service_tree(struct bfq_entity *entity) +{ + struct bfq_sched_data *sched_data = entity->sched_data; + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); + unsigned int idx = bfq_class_idx(entity); + + BUG_ON(idx >= BFQ_IOPRIO_CLASSES); + BUG_ON(sched_data == NULL); + + if (bfqq) + bfq_log_bfqq(bfqq->bfqd, bfqq, + "entity_service_tree %p %d", + sched_data->service_tree + idx, idx); +#ifdef BFQ_GROUP_IOSCHED_ENABLED + else { + struct bfq_group *bfqg = + container_of(entity, struct bfq_group, entity); + + bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, + "entity_service_tree %p %d", + sched_data->service_tree + idx, idx); + } +#endif + return sched_data->service_tree + idx; +} + +static struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, bool is_sync) +{ + return bic->bfqq[is_sync]; +} + +static void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq, + bool is_sync) +{ + bic->bfqq[is_sync] = bfqq; +} + +static struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic) +{ + return bic->icq.q->elevator->elevator_data; +} + +#ifdef BFQ_GROUP_IOSCHED_ENABLED + +static struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq) +{ + struct bfq_entity *group_entity = bfqq->entity.parent; + + if (!group_entity) + group_entity = &bfqq->bfqd->root_group->entity; + + return container_of(group_entity, struct bfq_group, entity); +} + +#else + +static struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq) +{ + return bfqq->bfqd->root_group; +} + +#endif + +static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio); +static void bfq_put_queue(struct bfq_queue *bfqq); +static void bfq_dispatch_insert(struct request_queue *q, struct request *rq); +static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, + struct bio *bio, bool is_sync, + struct bfq_io_cq *bic); +static void bfq_end_wr_async_queues(struct bfq_data *bfqd, + struct bfq_group *bfqg); +#ifdef BFQ_GROUP_IOSCHED_ENABLED +static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg); +#endif +static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq); + +#endif /* _BFQ_H */ From 0bd96428e086fd28800efdf5f0a5f62869af6e30 Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Sat, 21 Jan 2017 12:41:14 +0100 Subject: [PATCH 10/51] Move thinktime from bic to bfqq Prep change to make it possible to protect this field with a scheduler lock. Signed-off-by: Paolo Valente --- block/bfq-mq-iosched.c | 28 ++++++++++++++-------------- block/bfq-mq.h | 30 ++++++++++++++++-------------- 2 files changed, 30 insertions(+), 28 deletions(-) diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c index d1125aee658c..65f5dfb79417 100644 --- a/block/bfq-mq-iosched.c +++ b/block/bfq-mq-iosched.c @@ -698,6 +698,7 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd, if (unlikely(busy)) old_wr_coeff = bfqq->wr_coeff; + bfqq->ttime = bic->saved_ttime; bfqq->wr_coeff = bic->saved_wr_coeff; bfqq->wr_start_at_switch_to_srt = bic->saved_wr_start_at_switch_to_srt; BUG_ON(time_is_after_jiffies(bfqq->wr_start_at_switch_to_srt)); @@ -1287,7 +1288,7 @@ static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd, * details on the usage of the next variable. */ arrived_in_time = ktime_get_ns() <= - RQ_BIC(rq)->ttime.last_end_request + + bfqq->ttime.last_end_request + bfqd->bfq_slice_idle * 3; bfq_log_bfqq(bfqd, bfqq, @@ -2048,6 +2049,7 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq) if (!bic) return; + bic->saved_ttime = bfqq->ttime; bic->saved_has_short_ttime = bfq_bfqq_has_short_ttime(bfqq); bic->saved_IO_bound = bfq_bfqq_IO_bound(bfqq); bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq); @@ -3948,11 +3950,6 @@ static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) bfq_put_queue(bfqq); /* release process reference */ } -static void bfq_init_icq(struct io_cq *icq) -{ - icq_to_bic(icq)->ttime.last_end_request = ktime_get_ns() - (1ULL<<32); -} - static void bfq_exit_icq(struct io_cq *icq) { struct bfq_io_cq *bic = icq_to_bic(icq); @@ -4084,6 +4081,9 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, bfq_mark_bfqq_just_created(bfqq); } else bfq_clear_bfqq_sync(bfqq); + + bfqq->ttime.last_end_request = ktime_get_ns() - (1ULL<<32); + bfq_mark_bfqq_IO_bound(bfqq); /* Tentative initial value to trade off between thr and lat */ @@ -4191,14 +4191,14 @@ static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, } static void bfq_update_io_thinktime(struct bfq_data *bfqd, - struct bfq_io_cq *bic) + struct bfq_queue *bfqq) { - struct bfq_ttime *ttime = &bic->ttime; - u64 elapsed = ktime_get_ns() - bic->ttime.last_end_request; + struct bfq_ttime *ttime = &bfqq->ttime; + u64 elapsed = ktime_get_ns() - bfqq->ttime.last_end_request; elapsed = min_t(u64, elapsed, 2 * bfqd->bfq_slice_idle); - ttime->ttime_samples = (7*bic->ttime.ttime_samples + 256) / 8; + ttime->ttime_samples = (7*bfqq->ttime.ttime_samples + 256) / 8; ttime->ttime_total = div_u64(7*ttime->ttime_total + 256*elapsed, 8); ttime->ttime_mean = div64_ul(ttime->ttime_total + 128, ttime->ttime_samples); @@ -4240,8 +4240,8 @@ static void bfq_update_has_short_ttime(struct bfq_data *bfqd, * decide whether to mark as has_short_ttime */ if (atomic_read(&bic->icq.ioc->active_ref) == 0 || - (bfq_sample_valid(bic->ttime.ttime_samples) && - bic->ttime.ttime_mean > bfqd->bfq_slice_idle)) + (bfq_sample_valid(bfqq->ttime.ttime_samples) && + bfqq->ttime.ttime_mean > bfqd->bfq_slice_idle)) has_short_ttime = false; bfq_log_bfqq(bfqd, bfqq, "update_has_short_ttime: has_short_ttime %d", @@ -4265,7 +4265,7 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, if (rq->cmd_flags & REQ_META) bfqq->meta_pending++; - bfq_update_io_thinktime(bfqd, bic); + bfq_update_io_thinktime(bfqd, bfqq); bfq_update_has_short_ttime(bfqd, bfqq, bic); bfq_update_io_seektime(bfqd, bfqq, rq); @@ -4436,7 +4436,7 @@ static void bfq_completed_request(struct request_queue *q, struct request *rq) now_ns = ktime_get_ns(); - RQ_BIC(rq)->ttime.last_end_request = now_ns; + bfqq->ttime.last_end_request = now_ns; /* * Using us instead of ns, to get a reasonable precision in diff --git a/block/bfq-mq.h b/block/bfq-mq.h index 53954d1b87f8..0f51f270469c 100644 --- a/block/bfq-mq.h +++ b/block/bfq-mq.h @@ -210,6 +210,18 @@ struct bfq_entity { struct bfq_group; /** + * struct bfq_ttime - per process thinktime stats. + */ +struct bfq_ttime { + u64 last_end_request; /* completion time of last request */ + + u64 ttime_total; /* total process thinktime */ + unsigned long ttime_samples; /* number of thinktime samples */ + u64 ttime_mean; /* average process thinktime */ + +}; + +/** * struct bfq_queue - leaf schedulable entity. * * A bfq_queue is a leaf request queue; it can be associated with an @@ -270,6 +282,9 @@ struct bfq_queue { /* node for active/idle bfqq list inside parent bfqd */ struct list_head bfqq_list; + /* associated @bfq_ttime struct */ + struct bfq_ttime ttime; + /* bit vector: a 1 for each seeky requests in history */ u32 seek_history; @@ -333,18 +348,6 @@ struct bfq_queue { }; /** - * struct bfq_ttime - per process thinktime stats. - */ -struct bfq_ttime { - u64 last_end_request; /* completion time of last request */ - - u64 ttime_total; /* total process thinktime */ - unsigned long ttime_samples; /* number of thinktime samples */ - u64 ttime_mean; /* average process thinktime */ - -}; - -/** * struct bfq_io_cq - per (request_queue, io_context) structure. */ struct bfq_io_cq { @@ -352,8 +355,6 @@ struct bfq_io_cq { struct io_cq icq; /* must be the first member */ /* array of two process queues, the sync and the async */ struct bfq_queue *bfqq[2]; - /* associated @bfq_ttime struct */ - struct bfq_ttime ttime; /* per (request_queue, blkcg) ioprio */ int ioprio; #ifdef BFQ_GROUP_IOSCHED_ENABLED @@ -390,6 +391,7 @@ struct bfq_io_cq { unsigned long saved_last_wr_start_finish; unsigned long saved_wr_start_at_switch_to_srt; unsigned int saved_wr_cur_max_time; + struct bfq_ttime saved_ttime; }; enum bfq_device_speed { From 351a9aea7c0c9c30edacdbf2a3c0d089470de1e8 Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Wed, 18 Jan 2017 11:42:22 +0100 Subject: [PATCH 11/51] Embed bfq-ioc.c and add locking on request queue The version of bfq-ioc.c for bfq-iosched.c is not correct any more for bfq-mq, because, in bfq-mq, the request queue lock is not being held when bfq_bic_lookup is invoked. That function must then take that look on its own. This commit removes the inclusion of bfq-ioc.c, copies the content of bfq-ioc.c into bfq-mq-iosched.c, and adds the grabbing of the lock. Signed-off-by: Paolo Valente --- block/bfq-mq-iosched.c | 39 ++++++++++++++++++++++++++++++++++++--- 1 file changed, 36 insertions(+), 3 deletions(-) diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c index 65f5dfb79417..756a618d5902 100644 --- a/block/bfq-mq-iosched.c +++ b/block/bfq-mq-iosched.c @@ -195,7 +195,39 @@ static int device_speed_thresh[2]; static void bfq_schedule_dispatch(struct bfq_data *bfqd); -#include "bfq-ioc.c" +/** + * icq_to_bic - convert iocontext queue structure to bfq_io_cq. + * @icq: the iocontext queue. + */ +static struct bfq_io_cq *icq_to_bic(struct io_cq *icq) +{ + /* bic->icq is the first member, %NULL will convert to %NULL */ + return container_of(icq, struct bfq_io_cq, icq); +} + +/** + * bfq_bic_lookup - search into @ioc a bic associated to @bfqd. + * @bfqd: the lookup key. + * @ioc: the io_context of the process doing I/O. + * @q: the request queue. + */ +static struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd, + struct io_context *ioc, + struct request_queue *q) +{ + if (ioc) { + struct bfq_io_cq *icq; + + spin_lock_irq(q->queue_lock); + icq = icq_to_bic(ioc_lookup_icq(ioc, q)); + spin_unlock_irq(q->queue_lock); + + return icq; + } + + return NULL; +} + #include "bfq-sched.c" #include "bfq-cgroup-included.c" @@ -1520,13 +1552,14 @@ static void bfq_add_request(struct request *rq) } static struct request *bfq_find_rq_fmerge(struct bfq_data *bfqd, - struct bio *bio) + struct bio *bio, + struct request_queue *q) { struct task_struct *tsk = current; struct bfq_io_cq *bic; struct bfq_queue *bfqq; - bic = bfq_bic_lookup(bfqd, tsk->io_context); + bic = bfq_bic_lookup(bfqd, tsk->io_context, q); if (!bic) return NULL; From ed0d64e27b2308813a2a846139e405e0479f0849 Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Tue, 20 Dec 2016 09:07:19 +0100 Subject: [PATCH 12/51] Modify interface and operation to comply with blk-mq-sched As for modifications of the operation, the major changes are the introduction of a scheduler lock, and the moving to deferred work of the body of the hook exit_icq. The latter change has been made to avoid deadlocks caused by the combination of the following facts: 1) such a body takes the scheduler lock, and, if not deferred, 2) it does so from inside the exit_icq hook, which is invoked with the queue lock held, and 3) there is at least one code path, namely that starting from bfq_bio_merge, which takes these locks in the opposite order. Signed-off-by: Paolo Valente --- block/bfq-cgroup-included.c | 4 - block/bfq-mq-iosched.c | 695 ++++++++++++++++++++++++-------------------- block/bfq-mq.h | 35 +-- 3 files changed, 394 insertions(+), 340 deletions(-) diff --git a/block/bfq-cgroup-included.c b/block/bfq-cgroup-included.c index 9c483b658179..8a73de76f32b 100644 --- a/block/bfq-cgroup-included.c +++ b/block/bfq-cgroup-included.c @@ -472,8 +472,6 @@ static struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd, struct bfq_group *bfqg, *parent; struct bfq_entity *entity; - assert_spin_locked(bfqd->queue->queue_lock); - bfqg = bfq_lookup_bfqg(bfqd, blkcg); if (unlikely(!bfqg)) @@ -602,8 +600,6 @@ static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd, struct bfq_group *bfqg; struct bfq_entity *entity; - lockdep_assert_held(bfqd->queue->queue_lock); - bfqg = bfq_find_set_group(bfqd, blkcg); if (unlikely(!bfqg)) diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c index 756a618d5902..c963d92a32c2 100644 --- a/block/bfq-mq-iosched.c +++ b/block/bfq-mq-iosched.c @@ -81,7 +81,13 @@ #include #include #include +#include +#include + #include "blk.h" +#include "blk-mq.h" +#include "blk-mq-tag.h" +#include "blk-mq-sched.h" #undef CONFIG_BFQ_GROUP_IOSCHED /* cgroups support not yet functional */ #include "bfq-mq.h" @@ -193,8 +199,6 @@ static int device_speed_thresh[2]; #define RQ_BIC(rq) ((struct bfq_io_cq *) (rq)->elv.priv[0]) #define RQ_BFQQ(rq) ((rq)->elv.priv[1]) -static void bfq_schedule_dispatch(struct bfq_data *bfqd); - /** * icq_to_bic - convert iocontext queue structure to bfq_io_cq. * @icq: the iocontext queue. @@ -216,11 +220,12 @@ static struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd, struct request_queue *q) { if (ioc) { + unsigned long flags; struct bfq_io_cq *icq; - spin_lock_irq(q->queue_lock); + spin_lock_irqsave(q->queue_lock, flags); icq = icq_to_bic(ioc_lookup_icq(ioc, q)); - spin_unlock_irq(q->queue_lock); + spin_unlock_irqrestore(q->queue_lock, flags); return icq; } @@ -244,7 +249,7 @@ static void bfq_schedule_dispatch(struct bfq_data *bfqd) { if (bfqd->queued != 0) { bfq_log(bfqd, "schedule dispatch"); - kblockd_schedule_work(&bfqd->unplug_work); + blk_mq_run_hw_queues(bfqd->queue, true); } } @@ -768,9 +773,7 @@ static int bfqq_process_refs(struct bfq_queue *bfqq) { int process_refs, io_refs; - lockdep_assert_held(bfqq->bfqd->queue->queue_lock); - - io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE]; + io_refs = bfqq->allocated; process_refs = bfqq->ref - io_refs - bfqq->entity.on_st; BUG_ON(process_refs < 0); return process_refs; @@ -1584,6 +1587,7 @@ static sector_t get_sdist(sector_t last_pos, struct request *rq) return sdist; } +#if 0 /* Still not clear if we can do without next two functions */ static void bfq_activate_request(struct request_queue *q, struct request *rq) { struct bfq_data *bfqd = q->elevator->elevator_data; @@ -1597,8 +1601,10 @@ static void bfq_deactivate_request(struct request_queue *q, struct request *rq) BUG_ON(bfqd->rq_in_driver == 0); bfqd->rq_in_driver--; } +#endif -static void bfq_remove_request(struct request *rq) +static void bfq_remove_request(struct request_queue *q, + struct request *rq) { struct bfq_queue *bfqq = RQ_BFQQ(rq); struct bfq_data *bfqd = bfqq->bfqd; @@ -1619,6 +1625,10 @@ static void bfq_remove_request(struct request *rq) bfqd->queued--; elv_rb_del(&bfqq->sort_list, rq); + elv_rqhash_del(q, rq); + if (q->last_merge == rq) + q->last_merge = NULL; + if (RB_EMPTY_ROOT(&bfqq->sort_list)) { bfqq->next_rq = NULL; @@ -1659,13 +1669,36 @@ static void bfq_remove_request(struct request *rq) bfqg_stats_update_io_remove(bfqq_group(bfqq), rq->cmd_flags); } -static enum elv_merge bfq_merge(struct request_queue *q, struct request **req, - struct bio *bio) +static bool bfq_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio) +{ + struct request_queue *q = hctx->queue; + struct bfq_data *bfqd = q->elevator->elevator_data; + struct request *free = NULL; + bool ret; + + spin_lock_irq(&bfqd->lock); + ret = blk_mq_sched_try_merge(q, bio, &free); + + /* + * XXX Not yet freeing without lock held, to avoid an + * inconsistency with respect to the lock-protected invocation + * of blk_mq_sched_try_insert_merge in bfq_bio_merge. Waiting + * for clarifications from Jens. + */ + if (free) + blk_mq_free_request(free); + spin_unlock_irq(&bfqd->lock); + + return ret; +} + +static int bfq_request_merge(struct request_queue *q, struct request **req, + struct bio *bio) { struct bfq_data *bfqd = q->elevator->elevator_data; struct request *__rq; - __rq = bfq_find_rq_fmerge(bfqd, bio); + __rq = bfq_find_rq_fmerge(bfqd, bio, q); if (__rq && elv_bio_merge_ok(__rq, bio)) { *req = __rq; return ELEVATOR_FRONT_MERGE; @@ -1674,7 +1707,7 @@ static enum elv_merge bfq_merge(struct request_queue *q, struct request **req, return ELEVATOR_NO_MERGE; } -static void bfq_merged_request(struct request_queue *q, struct request *req, +static void bfq_request_merged(struct request_queue *q, struct request *req, enum elv_merge type) { if (type == ELEVATOR_FRONT_MERGE && @@ -1689,6 +1722,8 @@ static void bfq_merged_request(struct request_queue *q, struct request *req, /* Reposition request in its sort_list */ elv_rb_del(&bfqq->sort_list, req); elv_rb_add(&bfqq->sort_list, req); + + spin_lock_irq(&bfqd->lock); /* Choose next request to be served for bfqq */ prev = bfqq->next_rq; next_rq = bfq_choose_req(bfqd, bfqq->next_rq, req, @@ -1704,22 +1739,19 @@ static void bfq_merged_request(struct request_queue *q, struct request *req, bfq_updated_next_req(bfqd, bfqq); bfq_pos_tree_add_move(bfqd, bfqq); } + spin_unlock_irq(&bfqd->lock); } } -#ifdef BFQ_GROUP_IOSCHED_ENABLED -static void bfq_bio_merged(struct request_queue *q, struct request *req, - struct bio *bio) -{ - bfqg_stats_update_io_merged(bfqq_group(RQ_BFQQ(req)), bio->bi_opf); -} -#endif - -static void bfq_merged_requests(struct request_queue *q, struct request *rq, +static void bfq_requests_merged(struct request_queue *q, struct request *rq, struct request *next) { struct bfq_queue *bfqq = RQ_BFQQ(rq), *next_bfqq = RQ_BFQQ(next); + if (!RB_EMPTY_NODE(&rq->rb_node)) + goto end; + spin_lock_irq(&bfqq->bfqd->lock); + /* * If next and rq belong to the same bfq_queue and next is older * than rq, then reposition rq in the fifo (by substituting next @@ -1740,7 +1772,10 @@ static void bfq_merged_requests(struct request_queue *q, struct request *rq, if (bfqq->next_rq == next) bfqq->next_rq = rq; - bfq_remove_request(next); + bfq_remove_request(q, next); + + spin_unlock_irq(&bfqq->bfqd->lock); +end: bfqg_stats_update_io_merged(bfqq_group(bfqq), next->cmd_flags); } @@ -1786,7 +1821,7 @@ static void bfq_end_wr(struct bfq_data *bfqd) { struct bfq_queue *bfqq; - spin_lock_irq(bfqd->queue->queue_lock); + spin_lock_irq(&bfqd->lock); list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) bfq_bfqq_end_wr(bfqq); @@ -1794,7 +1829,7 @@ static void bfq_end_wr(struct bfq_data *bfqd) bfq_bfqq_end_wr(bfqq); bfq_end_wr_async(bfqd); - spin_unlock_irq(bfqd->queue->queue_lock); + spin_unlock_irq(&bfqd->lock); } static sector_t bfq_io_struct_pos(void *io_struct, bool request) @@ -2184,8 +2219,8 @@ bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic, bfq_put_queue(bfqq); } -static int bfq_allow_bio_merge(struct request_queue *q, struct request *rq, - struct bio *bio) +static bool bfq_allow_bio_merge(struct request_queue *q, struct request *rq, + struct bio *bio) { struct bfq_data *bfqd = q->elevator->elevator_data; bool is_sync = op_is_sync(bio->bi_opf); @@ -2203,7 +2238,7 @@ static int bfq_allow_bio_merge(struct request_queue *q, struct request *rq, * merge only if rq is queued there. * Queue lock is held here. */ - bic = bfq_bic_lookup(bfqd, current->io_context); + bic = bfq_bic_lookup(bfqd, current->io_context, q); if (!bic) return false; @@ -2228,12 +2263,6 @@ static int bfq_allow_bio_merge(struct request_queue *q, struct request *rq, return bfqq == RQ_BFQQ(rq); } -static int bfq_allow_rq_merge(struct request_queue *q, struct request *rq, - struct request *next) -{ - return RQ_BFQQ(rq) == RQ_BFQQ(next); -} - /* * Set the maximum time for the in-service queue to consume its * budget. This prevents seeky processes from lowering the throughput. @@ -2264,7 +2293,6 @@ static void __bfq_set_in_service_queue(struct bfq_data *bfqd, { if (bfqq) { bfqg_stats_update_avg_queue_size(bfqq_group(bfqq)); - bfq_mark_bfqq_must_alloc(bfqq); bfq_clear_bfqq_fifo_expire(bfqq); bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8; @@ -2703,27 +2731,28 @@ static void bfq_update_peak_rate(struct bfq_data *bfqd, struct request *rq) } /* - * Move request from internal lists to the dispatch list of the request queue + * Remove request from internal lists. */ -static void bfq_dispatch_insert(struct request_queue *q, struct request *rq) +static void bfq_dispatch_remove(struct request_queue *q, struct request *rq) { struct bfq_queue *bfqq = RQ_BFQQ(rq); /* - * For consistency, the next instruction should have been executed - * after removing the request from the queue and dispatching it. - * We execute instead this instruction before bfq_remove_request() - * (and hence introduce a temporary inconsistency), for efficiency. - * In fact, in a forced_dispatch, this prevents two counters related - * to bfqq->dispatched to risk to be uselessly decremented if bfqq - * is not in service, and then to be incremented again after - * incrementing bfqq->dispatched. + * For consistency, the next instruction should have been + * executed after removing the request from the queue and + * dispatching it. We execute instead this instruction before + * bfq_remove_request() (and hence introduce a temporary + * inconsistency), for efficiency. In fact, should this + * dispatch occur for a non in-service bfqq, this anticipated + * increment prevents two counters related to bfqq->dispatched + * from risking to be, first, uselessly decremented, and then + * incremented again when the (new) value of bfqq->dispatched + * happens to be taken into account. */ bfqq->dispatched++; bfq_update_peak_rate(q->elevator->elevator_data, rq); - bfq_remove_request(rq); - elv_dispatch_sort(q, rq); + bfq_remove_request(q, rq); } static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq) @@ -3605,7 +3634,7 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) bfq_log_bfqq(bfqd, bfqq, "select_queue: already in-service queue"); if (bfq_may_expire_for_budg_timeout(bfqq) && - !hrtimer_active(&bfqd->idle_slice_timer) && + !bfq_bfqq_wait_request(bfqq) && !bfq_bfqq_must_idle(bfqq)) goto expire; @@ -3641,7 +3670,6 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) * arrives. */ if (bfq_bfqq_wait_request(bfqq)) { - BUG_ON(!hrtimer_active(&bfqd->idle_slice_timer)); /* * If we get here: 1) at least a new request * has arrived but we have not disabled the @@ -3668,7 +3696,7 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) * for a new request, or has requests waiting for a completion and * may idle after their completion, then keep it anyway. */ - if (hrtimer_active(&bfqd->idle_slice_timer) || + if (bfq_bfqq_wait_request(bfqq) || (bfqq->dispatched != 0 && bfq_bfqq_may_idle(bfqq))) { bfqq = NULL; goto keep_queue; @@ -3753,13 +3781,11 @@ static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq) } /* - * Dispatch one request from bfqq, moving it to the request queue - * dispatch list. + * Dispatch next request from bfqq. */ -static int bfq_dispatch_request(struct bfq_data *bfqd, - struct bfq_queue *bfqq) +static struct request *bfq_dispatch_rq_from_bfqq(struct bfq_data *bfqd, + struct bfq_queue *bfqq) { - int dispatched = 0; struct request *rq = bfqq->next_rq; unsigned long service_to_charge; @@ -3775,7 +3801,7 @@ static int bfq_dispatch_request(struct bfq_data *bfqd, BUG_ON(bfqq->entity.budget < bfqq->entity.service); - bfq_dispatch_insert(bfqd->queue, rq); + bfq_dispatch_remove(bfqd->queue, rq); /* * If weight raising has to terminate for bfqq, then next @@ -3791,86 +3817,61 @@ static int bfq_dispatch_request(struct bfq_data *bfqd, bfq_update_wr_data(bfqd, bfqq); bfq_log_bfqq(bfqd, bfqq, - "dispatched %u sec req (%llu), budg left %d", + "dispatched %u sec req (%llu), budg left %d, new disp_nr %d", blk_rq_sectors(rq), (unsigned long long) blk_rq_pos(rq), - bfq_bfqq_budget_left(bfqq)); - - dispatched++; + bfq_bfqq_budget_left(bfqq), + bfqq->dispatched); if (!bfqd->in_service_bic) { atomic_long_inc(&RQ_BIC(rq)->icq.ioc->refcount); bfqd->in_service_bic = RQ_BIC(rq); } + /* + * Expire bfqq, pretending that its budget expired, if bfqq + * belongs to CLASS_IDLE and other queues are waiting for + * service. + */ if (bfqd->busy_queues > 1 && bfq_class_idle(bfqq)) goto expire; - return dispatched; + return rq; expire: bfq_bfqq_expire(bfqd, bfqq, false, BFQ_BFQQ_BUDGET_EXHAUSTED); - return dispatched; -} - -static int __bfq_forced_dispatch_bfqq(struct bfq_queue *bfqq) -{ - int dispatched = 0; - - while (bfqq->next_rq) { - bfq_dispatch_insert(bfqq->bfqd->queue, bfqq->next_rq); - dispatched++; - } - - BUG_ON(!list_empty(&bfqq->fifo)); - return dispatched; + return rq; } -/* - * Drain our current requests. - * Used for barriers and when switching io schedulers on-the-fly. - */ -static int bfq_forced_dispatch(struct bfq_data *bfqd) +static bool bfq_has_work(struct blk_mq_hw_ctx *hctx) { - struct bfq_queue *bfqq, *n; - struct bfq_service_tree *st; - int dispatched = 0; - - bfqq = bfqd->in_service_queue; - if (bfqq) - __bfq_bfqq_expire(bfqd, bfqq); + struct bfq_data *bfqd = hctx->queue->elevator->elevator_data; /* - * Loop through classes, and be careful to leave the scheduler - * in a consistent state, as feedback mechanisms and vtime - * updates cannot be disabled during the process. + * Avoiding lock: a race on bfqd->busy_queues should cause at + * most a call to dispatch for nothing */ - list_for_each_entry_safe(bfqq, n, &bfqd->active_list, bfqq_list) { - st = bfq_entity_service_tree(&bfqq->entity); - - dispatched += __bfq_forced_dispatch_bfqq(bfqq); - - bfqq->max_budget = bfq_max_budget(bfqd); - bfq_forget_idle(st); - } - - BUG_ON(bfqd->busy_queues != 0); - - return dispatched; + return !list_empty_careful(&bfqd->dispatch) || + bfqd->busy_queues > 0; } -static int bfq_dispatch_requests(struct request_queue *q, int force) +static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) { - struct bfq_data *bfqd = q->elevator->elevator_data; - struct bfq_queue *bfqq; + struct bfq_data *bfqd = hctx->queue->elevator->elevator_data; + struct request *rq = NULL; + struct bfq_queue *bfqq = NULL; + + if (!list_empty(&bfqd->dispatch)) { + rq = list_first_entry(&bfqd->dispatch, struct request, + queuelist); + list_del_init(&rq->queuelist); + goto exit; + } bfq_log(bfqd, "dispatch requests: %d busy queues", bfqd->busy_queues); if (bfqd->busy_queues == 0) - return 0; - - if (unlikely(force)) - return bfq_forced_dispatch(bfqd); + goto exit; /* * Force device to serve one request at a time if @@ -3885,25 +3886,39 @@ static int bfq_dispatch_requests(struct request_queue *q, int force) * throughput. */ if (bfqd->strict_guarantees && bfqd->rq_in_driver > 0) - return 0; + goto exit; bfqq = bfq_select_queue(bfqd); if (!bfqq) - return 0; + goto exit; BUG_ON(bfqq->entity.budget < bfqq->entity.service); BUG_ON(bfq_bfqq_wait_request(bfqq)); - if (!bfq_dispatch_request(bfqd, bfqq)) - return 0; - - bfq_log_bfqq(bfqd, bfqq, "dispatched %s request", - bfq_bfqq_sync(bfqq) ? "sync" : "async"); + rq = bfq_dispatch_rq_from_bfqq(bfqd, bfqq); BUG_ON(bfqq->next_rq == NULL && bfqq->entity.budget < bfqq->entity.service); - return 1; +exit: + if (rq) { + rq->rq_flags |= RQF_STARTED; + bfqd->rq_in_driver++; + } + + return rq; +} + +static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) +{ + struct bfq_data *bfqd = hctx->queue->elevator->elevator_data; + struct request *rq; + + spin_lock_irq(&bfqd->lock); + rq = __bfq_dispatch_request(hctx); + spin_unlock_irq(&bfqd->lock); + + return rq; } /* @@ -3921,13 +3936,14 @@ static void bfq_put_queue(struct bfq_queue *bfqq) BUG_ON(bfqq->ref <= 0); - bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p %d", bfqq, bfqq->ref); + if (bfqq->bfqd) + bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p %d", bfqq, bfqq->ref); + bfqq->ref--; if (bfqq->ref) return; BUG_ON(rb_first(&bfqq->sort_list)); - BUG_ON(bfqq->allocated[READ] + bfqq->allocated[WRITE] != 0); BUG_ON(bfqq->entity.tree); BUG_ON(bfq_bfqq_busy(bfqq)); @@ -3942,7 +3958,8 @@ static void bfq_put_queue(struct bfq_queue *bfqq) */ hlist_del_init(&bfqq->burst_list_node); - bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p freed", bfqq); + if (bfqq->bfqd) + bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p freed", bfqq); kmem_cache_free(bfq_pool, bfqq); #ifdef BFQ_GROUP_IOSCHED_ENABLED @@ -3983,29 +4000,52 @@ static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) bfq_put_queue(bfqq); /* release process reference */ } -static void bfq_exit_icq(struct io_cq *icq) +static void bfq_exit_icq_bfqq(struct bfq_io_cq *bic, bool is_sync) { - struct bfq_io_cq *bic = icq_to_bic(icq); - struct bfq_data *bfqd = bic_to_bfqd(bic); + struct bfq_queue *bfqq = bic_to_bfqq(bic, is_sync); + struct bfq_data *bfqd; - if (bic_to_bfqq(bic, false)) { - bfq_exit_bfqq(bfqd, bic_to_bfqq(bic, false)); - bic_set_bfqq(bic, NULL, false); - } + if (bfqq) + bfqd = bfqq->bfqd; /* NULL if scheduler already exited */ - if (bic_to_bfqq(bic, true)) { + if (bfqq && bfqd) { + spin_lock_irq(&bfqd->lock); /* * If the bic is using a shared queue, put the reference * taken on the io_context when the bic started using a * shared bfq_queue. */ - if (bfq_bfqq_coop(bic_to_bfqq(bic, true))) - put_io_context(icq->ioc); - bfq_exit_bfqq(bfqd, bic_to_bfqq(bic, true)); - bic_set_bfqq(bic, NULL, true); + if (is_sync && bfq_bfqq_coop(bfqq)) + put_io_context(bic->icq.ioc); + bfq_exit_bfqq(bfqd, bfqq); + bic_set_bfqq(bic, NULL, is_sync); + spin_unlock_irq(&bfqd->lock); } } +static void bfq_exit_icq_body(struct work_struct *work) +{ + struct bfq_io_cq *bic = + container_of(work, struct bfq_io_cq, exit_icq_work); + + bfq_exit_icq_bfqq(bic, true); + bfq_exit_icq_bfqq(bic, false); +} + +static void bfq_init_icq(struct io_cq *icq) +{ + struct bfq_io_cq *bic = icq_to_bic(icq); + + INIT_WORK(&bic->exit_icq_work, bfq_exit_icq_body); +} + +static void bfq_exit_icq(struct io_cq *icq) +{ + struct bfq_io_cq *bic = icq_to_bic(icq); + + kblockd_schedule_work(&bic->exit_icq_work); +} + /* * Update the entity prio values; note that the new values will not * be used until the next (re)activation. @@ -4015,6 +4055,10 @@ static void bfq_set_next_ioprio_data(struct bfq_queue *bfqq, { struct task_struct *tsk = current; int ioprio_class; + struct bfq_data *bfqd = bfqq->bfqd; + + if (!bfqd) + return; ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio); switch (ioprio_class) { @@ -4095,6 +4139,8 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, INIT_HLIST_NODE(&bfqq->burst_list_node); BUG_ON(!hlist_unhashed(&bfqq->burst_list_node)); + spin_lock_init(&bfqq->lock); + bfqq->ref = 0; bfqq->bfqd = bfqd; @@ -4351,22 +4397,13 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, if (budget_timeout) bfq_bfqq_expire(bfqd, bfqq, false, BFQ_BFQQ_BUDGET_TIMEOUT); - - /* - * Let the request rip immediately, or let a new queue be - * selected if bfqq has just been expired. - */ - __blk_run_queue(bfqd->queue); } } -static void bfq_insert_request(struct request_queue *q, struct request *rq) +static void __bfq_insert_request(struct bfq_data *bfqd, struct request *rq) { - struct bfq_data *bfqd = q->elevator->elevator_data; struct bfq_queue *bfqq = RQ_BFQQ(rq), *new_bfqq; - assert_spin_locked(bfqd->queue->queue_lock); - /* * An unplug may trigger a requeue of a request from the device * driver: make sure we are in process context while trying to @@ -4381,8 +4418,8 @@ static void bfq_insert_request(struct request_queue *q, struct request *rq) * Release the request's reference to the old bfqq * and make sure one is taken to the shared queue. */ - new_bfqq->allocated[rq_data_dir(rq)]++; - bfqq->allocated[rq_data_dir(rq)]--; + new_bfqq->allocated++; + bfqq->allocated--; new_bfqq->ref++; bfq_clear_bfqq_just_created(bfqq); if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq) @@ -4406,6 +4443,55 @@ static void bfq_insert_request(struct request_queue *q, struct request *rq) bfq_rq_enqueued(bfqd, bfqq, rq); } +static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, + bool at_head) +{ + struct request_queue *q = hctx->queue; + struct bfq_data *bfqd = q->elevator->elevator_data; + + spin_lock_irq(&bfqd->lock); + if (blk_mq_sched_try_insert_merge(q, rq)) + goto done; + spin_unlock_irq(&bfqd->lock); + + blk_mq_sched_request_inserted(rq); + + spin_lock_irq(&bfqd->lock); + if (at_head || blk_rq_is_passthrough(rq)) { + struct bfq_queue *bfqq = RQ_BFQQ(rq); + + if (at_head) + list_add(&rq->queuelist, &bfqd->dispatch); + else + list_add_tail(&rq->queuelist, &bfqd->dispatch); + + if (bfqq) + bfqq->dispatched++; + } else { + __bfq_insert_request(bfqd, rq); + + if (rq_mergeable(rq)) { + elv_rqhash_add(q, rq); + if (!q->last_merge) + q->last_merge = rq; + } + } +done: + spin_unlock_irq(&bfqd->lock); +} + +static void bfq_insert_requests(struct blk_mq_hw_ctx *hctx, + struct list_head *list, bool at_head) +{ + while (!list_empty(list)) { + struct request *rq; + + rq = list_first_entry(list, struct request, queuelist); + list_del_init(&rq->queuelist); + bfq_insert_request(hctx, rq, at_head); + } +} + static void bfq_update_hw_tag(struct bfq_data *bfqd) { bfqd->max_rq_in_driver = max_t(int, bfqd->max_rq_in_driver, @@ -4431,27 +4517,17 @@ static void bfq_update_hw_tag(struct bfq_data *bfqd) bfqd->hw_tag_samples = 0; } -static void bfq_completed_request(struct request_queue *q, struct request *rq) +static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd) { - struct bfq_queue *bfqq = RQ_BFQQ(rq); - struct bfq_data *bfqd = bfqq->bfqd; u64 now_ns; u32 delta_us; - bfq_log_bfqq(bfqd, bfqq, "completed one req with %u sects left", - blk_rq_sectors(rq)); - - assert_spin_locked(bfqd->queue->queue_lock); bfq_update_hw_tag(bfqd); BUG_ON(!bfqd->rq_in_driver); BUG_ON(!bfqq->dispatched); bfqd->rq_in_driver--; bfqq->dispatched--; - bfqg_stats_update_completion(bfqq_group(bfqq), - rq_start_time_ns(rq), - rq_io_start_time_ns(rq), - rq->cmd_flags); if (!bfqq->dispatched && !bfq_bfqq_busy(bfqq)) { BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list)); @@ -4477,7 +4553,8 @@ static void bfq_completed_request(struct request_queue *q, struct request *rq) */ delta_us = div_u64(now_ns - bfqd->last_completion, NSEC_PER_USEC); - bfq_log(bfqd, "rq_completed: delta %uus/%luus max_size %u rate %llu/%llu", + bfq_log_bfqq(bfqd, bfqq, + "rq_completed: delta %uus/%luus max_size %u rate %llu/%llu", delta_us, BFQ_MIN_TT/NSEC_PER_USEC, bfqd->last_rq_max_size, (USEC_PER_SEC* (u64)((bfqd->last_rq_max_size<in_service_queue == bfqq) { if (bfqq->dispatched == 0 && bfq_bfqq_must_idle(bfqq)) { bfq_arm_slice_timer(bfqd); - goto out; + return; } else if (bfq_may_expire_for_budg_timeout(bfqq)) bfq_bfqq_expire(bfqd, bfqq, false, BFQ_BFQQ_BUDGET_TIMEOUT); @@ -4537,68 +4614,55 @@ static void bfq_completed_request(struct request_queue *q, struct request *rq) bfq_bfqq_expire(bfqd, bfqq, false, BFQ_BFQQ_NO_MORE_REQUESTS); } - - if (!bfqd->rq_in_driver) - bfq_schedule_dispatch(bfqd); - -out: - return; } -static int __bfq_may_queue(struct bfq_queue *bfqq) +static void bfq_put_rq_priv_body(struct bfq_queue *bfqq) { - if (bfq_bfqq_wait_request(bfqq) && bfq_bfqq_must_alloc(bfqq)) { - bfq_clear_bfqq_must_alloc(bfqq); - return ELV_MQUEUE_MUST; - } + bfqq->allocated--; - return ELV_MQUEUE_MAY; + bfq_put_queue(bfqq); } -static int bfq_may_queue(struct request_queue *q, unsigned int op) +static void bfq_put_rq_private(struct request_queue *q, struct request *rq) { - struct bfq_data *bfqd = q->elevator->elevator_data; - struct task_struct *tsk = current; - struct bfq_io_cq *bic; - struct bfq_queue *bfqq; - - /* - * Don't force setup of a queue from here, as a call to may_queue - * does not necessarily imply that a request actually will be - * queued. So just lookup a possibly existing queue, or return - * 'may queue' if that fails. - */ - bic = bfq_bic_lookup(bfqd, tsk->io_context); - if (!bic) - return ELV_MQUEUE_MAY; - - bfqq = bic_to_bfqq(bic, op_is_sync(op)); - if (bfqq) - return __bfq_may_queue(bfqq); + struct bfq_queue *bfqq = RQ_BFQQ(rq); + struct bfq_data *bfqd = bfqq->bfqd; - return ELV_MQUEUE_MAY; -} + if (rq->rq_flags & RQF_STARTED) + bfqg_stats_update_completion(bfqq_group(bfqq), + rq_start_time_ns(rq), + rq_io_start_time_ns(rq), + rq->cmd_flags); -/* - * Queue lock held here. - */ -static void bfq_put_request(struct request *rq) -{ - struct bfq_queue *bfqq = RQ_BFQQ(rq); + if (likely(rq->rq_flags & RQF_STARTED)) { + unsigned long flags; - if (bfqq) { - const int rw = rq_data_dir(rq); + spin_lock_irqsave(&bfqd->lock, flags); - BUG_ON(!bfqq->allocated[rw]); - bfqq->allocated[rw]--; + bfq_completed_request(bfqq, bfqd); + bfq_put_rq_priv_body(bfqq); - rq->elv.priv[0] = NULL; - rq->elv.priv[1] = NULL; + spin_unlock_irqrestore(&bfqd->lock, flags); + } else { + /* + * Request rq may be still/already in the scheduler, + * in which case we need to remove it. And we cannot + * defer such a check and removal, to avoid + * inconsistencies in the time interval from the end + * of this function to the start of the deferred work. + * Fortunately, this situation occurs only in process + * context, so taking the scheduler lock does not + * cause any deadlock, even if other locks are already + * (correctly) held by this process. + */ - bfq_log_bfqq(bfqq->bfqd, bfqq, "put_request %p, %d", - bfqq, bfqq->ref); - bfq_put_queue(bfqq); + if (!RB_EMPTY_NODE(&rq->rb_node)) + bfq_remove_request(q, rq); + bfq_put_rq_priv_body(bfqq); } + + rq->elv.priv[0] = NULL; + rq->elv.priv[1] = NULL; } /* @@ -4630,18 +4694,16 @@ bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq) /* * Allocate bfq data structures associated with this request. */ -static int bfq_set_request(struct request_queue *q, struct request *rq, - struct bio *bio, gfp_t gfp_mask) +static int bfq_get_rq_private(struct request_queue *q, struct request *rq, + struct bio *bio) { struct bfq_data *bfqd = q->elevator->elevator_data; struct bfq_io_cq *bic = icq_to_bic(rq->elv.icq); - const int rw = rq_data_dir(rq); const int is_sync = rq_is_sync(rq); struct bfq_queue *bfqq; - unsigned long flags; bool bfqq_already_existing = false, split = false; - spin_lock_irqsave(q->queue_lock, flags); + spin_lock_irq(&bfqd->lock); if (!bic) goto queue_fail; @@ -4661,7 +4723,7 @@ static int bfq_set_request(struct request_queue *q, struct request *rq, bic_set_bfqq(bic, bfqq, is_sync); if (split && is_sync) { bfq_log_bfqq(bfqd, bfqq, - "set_request: was_in_list %d " + "get_request: was_in_list %d " "was_in_large_burst %d " "large burst in progress %d", bic->was_in_burst_list, @@ -4671,12 +4733,12 @@ static int bfq_set_request(struct request_queue *q, struct request *rq, if ((bic->was_in_burst_list && bfqd->large_burst) || bic->saved_in_large_burst) { bfq_log_bfqq(bfqd, bfqq, - "set_request: marking in " + "get_request: marking in " "large burst"); bfq_mark_bfqq_in_large_burst(bfqq); } else { bfq_log_bfqq(bfqd, bfqq, - "set_request: clearing in " + "get_request: clearing in " "large burst"); bfq_clear_bfqq_in_large_burst(bfqq); if (bic->was_in_burst_list) @@ -4703,9 +4765,12 @@ static int bfq_set_request(struct request_queue *q, struct request *rq, } } - bfqq->allocated[rw]++; + bfqq->allocated++; + bfq_log_bfqq(bfqq->bfqd, bfqq, + "get_request: new allocated %d", bfqq->allocated); + bfqq->ref++; - bfq_log_bfqq(bfqd, bfqq, "set_request: bfqq %p, %d", bfqq, bfqq->ref); + bfq_log_bfqq(bfqd, bfqq, "get_request: bfqq %p, %d", bfqq, bfqq->ref); rq->elv.priv[0] = bic; rq->elv.priv[1] = bfqq; @@ -4733,26 +4798,53 @@ static int bfq_set_request(struct request_queue *q, struct request *rq, if (unlikely(bfq_bfqq_just_created(bfqq))) bfq_handle_burst(bfqd, bfqq); - spin_unlock_irqrestore(q->queue_lock, flags); + spin_unlock_irq(&bfqd->lock); return 0; queue_fail: - bfq_schedule_dispatch(bfqd); - spin_unlock_irqrestore(q->queue_lock, flags); + spin_unlock_irq(&bfqd->lock); return 1; } -static void bfq_kick_queue(struct work_struct *work) +static void bfq_idle_slice_timer_body(struct bfq_queue *bfqq) { - struct bfq_data *bfqd = - container_of(work, struct bfq_data, unplug_work); - struct request_queue *q = bfqd->queue; + struct bfq_data *bfqd = bfqq->bfqd; + enum bfqq_expiration reason; + unsigned long flags; + + spin_lock_irqsave(&bfqd->lock, flags); + bfq_clear_bfqq_wait_request(bfqq); - spin_lock_irq(q->queue_lock); - __blk_run_queue(q); - spin_unlock_irq(q->queue_lock); + if (bfqq != bfqd->in_service_queue) { + spin_unlock_irqrestore(&bfqd->lock, flags); + return; + } + + if (bfq_bfqq_budget_timeout(bfqq)) + /* + * Also here the queue can be safely expired + * for budget timeout without wasting + * guarantees + */ + reason = BFQ_BFQQ_BUDGET_TIMEOUT; + else if (bfqq->queued[0] == 0 && bfqq->queued[1] == 0) + /* + * The queue may not be empty upon timer expiration, + * because we may not disable the timer when the + * first request of the in-service queue arrives + * during disk idling. + */ + reason = BFQ_BFQQ_TOO_IDLE; + else + goto schedule_dispatch; + + bfq_bfqq_expire(bfqd, bfqq, true, reason); + +schedule_dispatch: + spin_unlock_irqrestore(&bfqd->lock, flags); + bfq_schedule_dispatch(bfqd); } /* @@ -4763,59 +4855,22 @@ static enum hrtimer_restart bfq_idle_slice_timer(struct hrtimer *timer) { struct bfq_data *bfqd = container_of(timer, struct bfq_data, idle_slice_timer); - struct bfq_queue *bfqq; - unsigned long flags; - enum bfqq_expiration reason; - - spin_lock_irqsave(bfqd->queue->queue_lock, flags); + struct bfq_queue *bfqq = bfqd->in_service_queue; - bfqq = bfqd->in_service_queue; /* * Theoretical race here: the in-service queue can be NULL or - * different from the queue that was idling if the timer handler - * spins on the queue_lock and a new request arrives for the - * current queue and there is a full dispatch cycle that changes - * the in-service queue. This can hardly happen, but in the worst - * case we just expire a queue too early. + * different from the queue that was idling if a new request + * arrives for the current queue and there is a full dispatch + * cycle that changes the in-service queue. This can hardly + * happen, but in the worst case we just expire a queue too + * early. */ - if (bfqq) { - bfq_log_bfqq(bfqd, bfqq, "slice_timer expired"); - bfq_clear_bfqq_wait_request(bfqq); - - if (bfq_bfqq_budget_timeout(bfqq)) - /* - * Also here the queue can be safely expired - * for budget timeout without wasting - * guarantees - */ - reason = BFQ_BFQQ_BUDGET_TIMEOUT; - else if (bfqq->queued[0] == 0 && bfqq->queued[1] == 0) - /* - * The queue may not be empty upon timer expiration, - * because we may not disable the timer when the - * first request of the in-service queue arrives - * during disk idling. - */ - reason = BFQ_BFQQ_TOO_IDLE; - else - goto schedule_dispatch; - - bfq_bfqq_expire(bfqd, bfqq, true, reason); - } - -schedule_dispatch: - bfq_schedule_dispatch(bfqd); + if (bfqq) + bfq_idle_slice_timer_body(bfqq); - spin_unlock_irqrestore(bfqd->queue->queue_lock, flags); return HRTIMER_NORESTART; } -static void bfq_shutdown_timer_wq(struct bfq_data *bfqd) -{ - hrtimer_cancel(&bfqd->idle_slice_timer); - cancel_work_sync(&bfqd->unplug_work); -} - static void __bfq_put_async_bfqq(struct bfq_data *bfqd, struct bfq_queue **bfqq_ptr) { @@ -4852,28 +4907,40 @@ static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg) static void bfq_exit_queue(struct elevator_queue *e) { struct bfq_data *bfqd = e->elevator_data; - struct request_queue *q = bfqd->queue; struct bfq_queue *bfqq, *n; - bfq_shutdown_timer_wq(bfqd); - - spin_lock_irq(q->queue_lock); + hrtimer_cancel(&bfqd->idle_slice_timer); BUG_ON(bfqd->in_service_queue); - list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list) - bfq_deactivate_bfqq(bfqd, bfqq, false, false); - spin_unlock_irq(q->queue_lock); + list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list) { + if (bfqq->bic) /* bfqqs without bic are handled below */ + cancel_work_sync(&bfqq->bic->exit_icq_work); + } + + spin_lock_irq(&bfqd->lock); + list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list) { + bfq_deactivate_bfqq(bfqd, bfqq, false, false); + /* + * Make sure that deferred exit_icq_work completes + * without errors for bfq_queues without bic + */ + if (!bfqq->bic) + bfqq->bfqd = NULL; + } + spin_unlock_irq(&bfqd->lock); - bfq_shutdown_timer_wq(bfqd); + hrtimer_cancel(&bfqd->idle_slice_timer); BUG_ON(hrtimer_active(&bfqd->idle_slice_timer)); #ifdef BFQ_GROUP_IOSCHED_ENABLED - blkcg_deactivate_policy(q, &blkcg_policy_bfq); + blkcg_deactivate_policy(bfqd->queue, &blkcg_policy_bfq); #else + spin_lock_irq(&bfqd->lock); bfq_put_async_queues(bfqd, bfqd->root_group); kfree(bfqd->root_group); + spin_unlock_irq(&bfqd->lock); #endif kfree(bfqd); @@ -4934,10 +5001,6 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) bfqd->queue = q; - spin_lock_irq(q->queue_lock); - q->elevator = eq; - spin_unlock_irq(q->queue_lock); - bfqd->root_group = bfq_create_group_hierarchy(bfqd, q->node); if (!bfqd->root_group) goto out_free; @@ -4951,8 +5014,6 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) bfqd->queue_weights_tree = RB_ROOT; bfqd->group_weights_tree = RB_ROOT; - INIT_WORK(&bfqd->unplug_work, bfq_kick_queue); - INIT_LIST_HEAD(&bfqd->active_list); INIT_LIST_HEAD(&bfqd->idle_list); INIT_HLIST_HEAD(&bfqd->burst_list); @@ -5001,6 +5062,11 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) bfqd->peak_rate = R_fast[blk_queue_nonrot(bfqd->queue)] * 2 / 3; bfqd->device_speed = BFQ_BFQD_FAST; + spin_lock_init(&bfqd->lock); + INIT_LIST_HEAD(&bfqd->dispatch); + + q->elevator = eq; + return 0; out_free: @@ -5057,7 +5123,7 @@ static ssize_t bfq_weights_show(struct elevator_queue *e, char *page) num_char += sprintf(page + num_char, "Tot reqs queued %d\n\n", bfqd->queued); - spin_lock_irq(bfqd->queue->queue_lock); + spin_lock_irq(&bfqd->lock); num_char += sprintf(page + num_char, "Active:\n"); list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) { @@ -5086,7 +5152,7 @@ static ssize_t bfq_weights_show(struct elevator_queue *e, char *page) jiffies_to_msecs(bfqq->wr_cur_max_time)); } - spin_unlock_irq(bfqd->queue->queue_lock); + spin_unlock_irq(&bfqd->lock); return num_char; } @@ -5294,35 +5360,31 @@ static struct elv_fs_entry bfq_attrs[] = { __ATTR_NULL }; -static struct elevator_type iosched_bfq = { - .ops.sq = { - .elevator_merge_fn = bfq_merge, - .elevator_merged_fn = bfq_merged_request, - .elevator_merge_req_fn = bfq_merged_requests, -#ifdef BFQ_GROUP_IOSCHED_ENABLED - .elevator_bio_merged_fn = bfq_bio_merged, -#endif - .elevator_allow_bio_merge_fn = bfq_allow_bio_merge, - .elevator_allow_rq_merge_fn = bfq_allow_rq_merge, - .elevator_dispatch_fn = bfq_dispatch_requests, - .elevator_add_req_fn = bfq_insert_request, - .elevator_activate_req_fn = bfq_activate_request, - .elevator_deactivate_req_fn = bfq_deactivate_request, - .elevator_completed_req_fn = bfq_completed_request, - .elevator_former_req_fn = elv_rb_former_request, - .elevator_latter_req_fn = elv_rb_latter_request, - .elevator_init_icq_fn = bfq_init_icq, - .elevator_exit_icq_fn = bfq_exit_icq, - .elevator_set_req_fn = bfq_set_request, - .elevator_put_req_fn = bfq_put_request, - .elevator_may_queue_fn = bfq_may_queue, - .elevator_init_fn = bfq_init_queue, - .elevator_exit_fn = bfq_exit_queue, +static struct elevator_type iosched_bfq_mq = { + .ops.mq = { + .get_rq_priv = bfq_get_rq_private, + .put_rq_priv = bfq_put_rq_private, + .init_icq = bfq_init_icq, + .exit_icq = bfq_exit_icq, + .insert_requests = bfq_insert_requests, + .dispatch_request = bfq_dispatch_request, + .next_request = elv_rb_latter_request, + .former_request = elv_rb_former_request, + .allow_merge = bfq_allow_bio_merge, + .bio_merge = bfq_bio_merge, + .request_merge = bfq_request_merge, + .requests_merged = bfq_requests_merged, + .request_merged = bfq_request_merged, + .has_work = bfq_has_work, + .init_sched = bfq_init_queue, + .exit_sched = bfq_exit_queue, }, + + .uses_mq = true, .icq_size = sizeof(struct bfq_io_cq), .icq_align = __alignof__(struct bfq_io_cq), .elevator_attrs = bfq_attrs, - .elevator_name = "bfq-sq", + .elevator_name = "bfq-mq", .elevator_owner = THIS_MODULE, }; @@ -5392,7 +5454,7 @@ static int __init bfq_init(void) device_speed_thresh[0] = (4 * R_slow[0]) / 3; device_speed_thresh[1] = (4 * R_slow[1]) / 3; - ret = elv_register(&iosched_bfq); + ret = elv_register(&iosched_bfq_mq); if (ret) goto err_pol_unreg; @@ -5412,8 +5474,8 @@ static int __init bfq_init(void) static void __exit bfq_exit(void) { - elv_unregister(&iosched_bfq); -#ifdef BFQ_GROUP_IOSCHED_ENABLED + elv_unregister(&iosched_bfq_mq); +#ifdef CONFIG_BFQ_GROUP_ENABLED blkcg_policy_unregister(&blkcg_policy_bfq); #endif bfq_slab_kill(); @@ -5422,5 +5484,6 @@ static void __exit bfq_exit(void) module_init(bfq_init); module_exit(bfq_exit); -MODULE_AUTHOR("Arianna Avanzini, Fabio Checconi, Paolo Valente"); +MODULE_AUTHOR("Paolo Valente"); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("MQ Budget Fair Queueing I/O Scheduler"); diff --git a/block/bfq-mq.h b/block/bfq-mq.h index 0f51f270469c..c3fcd5ebd735 100644 --- a/block/bfq-mq.h +++ b/block/bfq-mq.h @@ -19,15 +19,8 @@ #include #include -/* - * Define an alternative macro to compile cgroups support. This is one - * of the steps needed to let bfq-mq share the files bfq-sched.c and - * bfq-cgroup.c with bfq-sq. For bfq-mq, the macro - * BFQ_GROUP_IOSCHED_ENABLED will be defined as a function of whether - * the configuration option CONFIG_BFQ_MQ_GROUP_IOSCHED, and not - * CONFIG_BFQ_GROUP_IOSCHED, is defined. - */ -#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED +/* see comments on CONFIG_BFQ_GROUP_IOSCHED in bfq.h */ +#ifdef CONFIG_BFQ_MQ_GROUP_IOSCHED #define BFQ_GROUP_IOSCHED_ENABLED #endif @@ -259,8 +252,8 @@ struct bfq_queue { struct request *next_rq; /* number of sync and async requests queued */ int queued[2]; - /* number of sync and async requests currently allocated */ - int allocated[2]; + /* number of requests currently allocated */ + int allocated; /* number of pending metadata requests */ int meta_pending; /* fifo list of requests in sort_list */ @@ -345,6 +338,8 @@ struct bfq_queue { unsigned long wr_start_at_switch_to_srt; unsigned long split_time; /* time of last split */ + + spinlock_t lock; }; /** @@ -361,6 +356,9 @@ struct bfq_io_cq { uint64_t blkcg_serial_nr; /* the current blkcg serial */ #endif + /* delayed work to exec the body of the the exit_icq handler */ + struct work_struct exit_icq_work; + /* * Snapshot of the has_short_time flag before merging; taken * to remember its value while the queue is merged, so as to @@ -402,11 +400,13 @@ enum bfq_device_speed { /** * struct bfq_data - per-device data structure. * - * All the fields are protected by the @queue lock. + * All the fields are protected by @lock. */ struct bfq_data { - /* request queue for the device */ + /* device request queue */ struct request_queue *queue; + /* dispatch queue */ + struct list_head dispatch; /* root bfq_group for the device */ struct bfq_group *root_group; @@ -460,8 +460,6 @@ struct bfq_data { * the queue in service. */ struct hrtimer idle_slice_timer; - /* delayed work to restart dispatching on the request queue */ - struct work_struct unplug_work; /* bfq_queue in service */ struct bfq_queue *in_service_queue; @@ -612,6 +610,8 @@ struct bfq_data { /* fallback dummy bfqq for extreme OOM conditions */ struct bfq_queue oom_bfqq; + + spinlock_t lock; }; enum bfqq_state_flags { @@ -622,7 +622,6 @@ enum bfqq_state_flags { * waiting for a request * without idling the device */ - BFQ_BFQQ_FLAG_must_alloc, /* must be allowed rq alloc */ BFQ_BFQQ_FLAG_fifo_expire, /* FIFO checked in this slice */ BFQ_BFQQ_FLAG_has_short_ttime, /* queue has a short think time */ BFQ_BFQQ_FLAG_sync, /* synchronous queue */ @@ -661,7 +660,6 @@ BFQ_BFQQ_FNS(just_created); BFQ_BFQQ_FNS(busy); BFQ_BFQQ_FNS(wait_request); BFQ_BFQQ_FNS(non_blocking_wait_rq); -BFQ_BFQQ_FNS(must_alloc); BFQ_BFQQ_FNS(fifo_expire); BFQ_BFQQ_FNS(has_short_ttime); BFQ_BFQQ_FNS(sync); @@ -692,7 +690,6 @@ static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); #define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \ char __pbuf[128]; \ \ - assert_spin_locked((bfqd)->queue->queue_lock); \ blkg_path(bfqg_to_blkg(bfqq_group(bfqq)), __pbuf, sizeof(__pbuf)); \ pr_crit("%s bfq%d%c %s " fmt "\n", \ checked_dev_name((bfqd)->queue->backing_dev_info->dev), \ @@ -734,7 +731,6 @@ static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); #define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \ char __pbuf[128]; \ \ - assert_spin_locked((bfqd)->queue->queue_lock); \ blkg_path(bfqg_to_blkg(bfqq_group(bfqq)), __pbuf, sizeof(__pbuf)); \ blk_add_trace_msg((bfqd)->queue, "bfq%d%c %s " fmt, \ (bfqq)->pid, \ @@ -961,7 +957,6 @@ static struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq) static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio); static void bfq_put_queue(struct bfq_queue *bfqq); -static void bfq_dispatch_insert(struct request_queue *q, struct request *rq); static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, struct bio *bio, bool is_sync, struct bfq_io_cq *bic); From bde5235de2241502c1c00337bd51c96d9b60b6df Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Fri, 3 Mar 2017 08:52:40 +0100 Subject: [PATCH 13/51] Add checks and extra log messages - Part I Signed-off-by: Paolo Valente --- block/bfq-mq-iosched.c | 112 +++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 109 insertions(+), 3 deletions(-) diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c index c963d92a32c2..40eadb3f7073 100644 --- a/block/bfq-mq-iosched.c +++ b/block/bfq-mq-iosched.c @@ -773,6 +773,8 @@ static int bfqq_process_refs(struct bfq_queue *bfqq) { int process_refs, io_refs; + lockdep_assert_held(&bfqq->bfqd->lock); + io_refs = bfqq->allocated; process_refs = bfqq->ref - io_refs - bfqq->entity.on_st; BUG_ON(process_refs < 0); @@ -1483,6 +1485,8 @@ static void bfq_add_request(struct request *rq) bfqq->queued[rq_is_sync(rq)]++; bfqd->queued++; + BUG_ON(!RQ_BFQQ(rq)); + BUG_ON(RQ_BFQQ(rq) != bfqq); elv_rb_add(&bfqq->sort_list, rq); /* @@ -1491,6 +1495,8 @@ static void bfq_add_request(struct request *rq) prev = bfqq->next_rq; next_rq = bfq_choose_req(bfqd, bfqq->next_rq, rq, bfqd->last_position); BUG_ON(!next_rq); + BUG_ON(!RQ_BFQQ(next_rq)); + BUG_ON(RQ_BFQQ(next_rq) != bfqq); bfqq->next_rq = next_rq; /* @@ -1615,6 +1621,19 @@ static void bfq_remove_request(struct request_queue *q, if (bfqq->next_rq == rq) { bfqq->next_rq = bfq_find_next_rq(bfqd, bfqq, rq); + if (bfqq->next_rq && !RQ_BFQQ(bfqq->next_rq)) { + pr_crit("no bfqq! for next rq %p bfqq %p\n", + bfqq->next_rq, bfqq); + } + + BUG_ON(bfqq->next_rq && !RQ_BFQQ(bfqq->next_rq)); + if (bfqq->next_rq && RQ_BFQQ(bfqq->next_rq) != bfqq) { + pr_crit( + "wrong bfqq! for next rq %p, rq_bfqq %p bfqq %p\n", + bfqq->next_rq, RQ_BFQQ(bfqq->next_rq), bfqq); + } + BUG_ON(bfqq->next_rq && RQ_BFQQ(bfqq->next_rq) != bfqq); + bfq_updated_next_req(bfqd, bfqq); } @@ -1701,6 +1720,8 @@ static int bfq_request_merge(struct request_queue *q, struct request **req, __rq = bfq_find_rq_fmerge(bfqd, bio, q); if (__rq && elv_bio_merge_ok(__rq, bio)) { *req = __rq; + bfq_log(bfqd, "request_merge: req %p", __rq); + return ELEVATOR_FRONT_MERGE; } @@ -1721,6 +1742,8 @@ static void bfq_request_merged(struct request_queue *q, struct request *req, /* Reposition request in its sort_list */ elv_rb_del(&bfqq->sort_list, req); + BUG_ON(!RQ_BFQQ(req)); + BUG_ON(RQ_BFQQ(req) != bfqq); elv_rb_add(&bfqq->sort_list, req); spin_lock_irq(&bfqd->lock); @@ -1729,7 +1752,13 @@ static void bfq_request_merged(struct request_queue *q, struct request *req, next_rq = bfq_choose_req(bfqd, bfqq->next_rq, req, bfqd->last_position); BUG_ON(!next_rq); + bfqq->next_rq = next_rq; + + bfq_log_bfqq(bfqd, bfqq, + "requests_merged: req %p prev %p next_rq %p bfqq %p", + req, prev, next_rq, bfqq); + /* * If next_rq changes, update both the queue's budget to * fit the new request and the queue's position in its @@ -1748,8 +1777,16 @@ static void bfq_requests_merged(struct request_queue *q, struct request *rq, { struct bfq_queue *bfqq = RQ_BFQQ(rq), *next_bfqq = RQ_BFQQ(next); + BUG_ON(!RQ_BFQQ(rq)); + BUG_ON(!RQ_BFQQ(next)); + if (!RB_EMPTY_NODE(&rq->rb_node)) goto end; + + bfq_log_bfqq(bfqq->bfqd, bfqq, + "requests_merged: rq %p next %p bfqq %p next_bfqq %p", + rq, next, bfqq, next_bfqq); + spin_lock_irq(&bfqq->bfqd->lock); /* @@ -3847,6 +3884,9 @@ static bool bfq_has_work(struct blk_mq_hw_ctx *hctx) { struct bfq_data *bfqd = hctx->queue->elevator->elevator_data; + bfq_log(bfqd, "has_work, dispatch_non_empty %d busy_queues %d", + !list_empty_careful(&bfqd->dispatch), bfqd->busy_queues > 0); + /* * Avoiding lock: a race on bfqd->busy_queues should cause at * most a call to dispatch for nothing @@ -3865,6 +3905,8 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) rq = list_first_entry(&bfqd->dispatch, struct request, queuelist); list_del_init(&rq->queuelist); + bfq_log(bfqd, + "dispatch requests: picked %p from dispatch list", rq); goto exit; } @@ -3904,7 +3946,20 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) if (rq) { rq->rq_flags |= RQF_STARTED; bfqd->rq_in_driver++; - } + if (bfqq) + bfq_log_bfqq(bfqd, bfqq, + "dispatched %s request %p, rq_in_driver %d", + bfq_bfqq_sync(bfqq) ? "sync" : "async", + rq, + bfqd->rq_in_driver); + else + bfq_log(bfqd, + "dispatched request %p from dispatch list, rq_in_driver %d", + rq, bfqd->rq_in_driver); + } else + bfq_log(bfqd, + "returned NULL request, rq_in_driver %d", + bfqd->rq_in_driver); return rq; } @@ -3944,6 +3999,7 @@ static void bfq_put_queue(struct bfq_queue *bfqq) return; BUG_ON(rb_first(&bfqq->sort_list)); + BUG_ON(bfqq->allocated != 0); BUG_ON(bfqq->entity.tree); BUG_ON(bfq_bfqq_busy(bfqq)); @@ -4043,6 +4099,7 @@ static void bfq_exit_icq(struct io_cq *icq) { struct bfq_io_cq *bic = icq_to_bic(icq); + BUG_ON(!bic); kblockd_schedule_work(&bic->exit_icq_work); } @@ -4057,6 +4114,7 @@ static void bfq_set_next_ioprio_data(struct bfq_queue *bfqq, int ioprio_class; struct bfq_data *bfqd = bfqq->bfqd; + WARN_ON(!bfqd); if (!bfqd) return; @@ -4404,6 +4462,10 @@ static void __bfq_insert_request(struct bfq_data *bfqd, struct request *rq) { struct bfq_queue *bfqq = RQ_BFQQ(rq), *new_bfqq; + assert_spin_locked(&bfqd->lock); + + bfq_log_bfqq(bfqd, bfqq, "__insert_req: rq %p bfqq %p", rq, bfqq); + /* * An unplug may trigger a requeue of a request from the device * driver: make sure we are in process context while trying to @@ -4420,6 +4482,12 @@ static void __bfq_insert_request(struct bfq_data *bfqd, struct request *rq) */ new_bfqq->allocated++; bfqq->allocated--; + bfq_log_bfqq(bfqd, bfqq, + "insert_request: new allocated %d", bfqq->allocated); + bfq_log_bfqq(bfqd, new_bfqq, + "insert_request: new_bfqq new allocated %d", + bfqq->allocated); + new_bfqq->ref++; bfq_clear_bfqq_just_created(bfqq); if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq) @@ -4529,6 +4597,10 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd) bfqd->rq_in_driver--; bfqq->dispatched--; + bfq_log_bfqq(bfqd, bfqq, + "completed_requests: new disp %d, new rq_in_driver %d", + bfqq->dispatched, bfqd->rq_in_driver); + if (!bfqq->dispatched && !bfq_bfqq_busy(bfqq)) { BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list)); /* @@ -4618,6 +4690,9 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd) static void bfq_put_rq_priv_body(struct bfq_queue *bfqq) { + bfq_log_bfqq(bfqq->bfqd, bfqq, + "put_request_body: allocated %d", bfqq->allocated); + BUG_ON(!bfqq->allocated); bfqq->allocated--; bfq_put_queue(bfqq); @@ -4625,8 +4700,27 @@ static void bfq_put_rq_priv_body(struct bfq_queue *bfqq) static void bfq_put_rq_private(struct request_queue *q, struct request *rq) { - struct bfq_queue *bfqq = RQ_BFQQ(rq); - struct bfq_data *bfqd = bfqq->bfqd; + struct bfq_queue *bfqq; + struct bfq_data *bfqd; + struct bfq_io_cq *bic; + + BUG_ON(!rq); + bfqq = RQ_BFQQ(rq); + BUG_ON(!bfqq); + + bic = RQ_BIC(rq); + BUG_ON(!bic); + + bfqd = bfqq->bfqd; + BUG_ON(!bfqd); + + BUG_ON(rq->rq_flags & RQF_QUEUED); + BUG_ON(!(rq->rq_flags & RQF_ELVPRIV)); + + bfq_log_bfqq(bfqd, bfqq, + "putting rq %p with %u sects left, STARTED %d", + rq, blk_rq_sectors(rq), + rq->rq_flags & RQF_STARTED); if (rq->rq_flags & RQF_STARTED) bfqg_stats_update_completion(bfqq_group(bfqq), @@ -4634,6 +4728,8 @@ static void bfq_put_rq_private(struct request_queue *q, struct request *rq) rq_io_start_time_ns(rq), rq->cmd_flags); + BUG_ON(blk_rq_sectors(rq) == 0 && !(rq->rq_flags & RQF_STARTED)); + if (likely(rq->rq_flags & RQF_STARTED)) { unsigned long flags; @@ -4655,7 +4751,9 @@ static void bfq_put_rq_private(struct request_queue *q, struct request *rq) * cause any deadlock, even if other locks are already * (correctly) held by this process. */ + BUG_ON(in_interrupt()); + assert_spin_locked(&bfqd->lock); if (!RB_EMPTY_NODE(&rq->rb_node)) bfq_remove_request(q, rq); bfq_put_rq_priv_body(bfqq); @@ -4814,7 +4912,9 @@ static void bfq_idle_slice_timer_body(struct bfq_queue *bfqq) enum bfqq_expiration reason; unsigned long flags; + BUG_ON(!bfqd); spin_lock_irqsave(&bfqd->lock, flags); + bfq_log_bfqq(bfqd, bfqq, "handling slice_timer expiration"); bfq_clear_bfqq_wait_request(bfqq); if (bfqq != bfqd->in_service_queue) { @@ -4857,6 +4957,8 @@ static enum hrtimer_restart bfq_idle_slice_timer(struct hrtimer *timer) idle_slice_timer); struct bfq_queue *bfqq = bfqd->in_service_queue; + bfq_log(bfqd, "slice_timer expired"); + /* * Theoretical race here: the in-service queue can be NULL or * different from the queue that was idling if a new request @@ -4909,9 +5011,12 @@ static void bfq_exit_queue(struct elevator_queue *e) struct bfq_data *bfqd = e->elevator_data; struct bfq_queue *bfqq, *n; + bfq_log(bfqd, "exit_queue: starting ..."); + hrtimer_cancel(&bfqd->idle_slice_timer); BUG_ON(bfqd->in_service_queue); + BUG_ON(!list_empty(&bfqd->active_list)); list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list) { if (bfqq->bic) /* bfqqs without bic are handled below */ @@ -4943,6 +5048,7 @@ static void bfq_exit_queue(struct elevator_queue *e) spin_unlock_irq(&bfqd->lock); #endif + bfq_log(bfqd, "exit_queue: finished ..."); kfree(bfqd); } From 7f59486861e368d25f59d4136cf8e51a75b7edf9 Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Thu, 9 Feb 2017 10:36:27 +0100 Subject: [PATCH 14/51] Add lock check in bfq_allow_bio_merge Signed-off-by: Paolo Valente --- block/bfq-mq-iosched.c | 1 + 1 file changed, 1 insertion(+) diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c index 40eadb3f7073..21b876aeba16 100644 --- a/block/bfq-mq-iosched.c +++ b/block/bfq-mq-iosched.c @@ -2279,6 +2279,7 @@ static bool bfq_allow_bio_merge(struct request_queue *q, struct request *rq, if (!bic) return false; + assert_spin_locked(&bfqd->lock); bfqq = bic_to_bfqq(bic, is_sync); /* * We take advantage of this function to perform an early merge From a2dd19a4d95cf401268c144c79ce549c7fc4bbca Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Tue, 7 Feb 2017 15:14:29 +0100 Subject: [PATCH 15/51] bfq-mq: execute exit_icq operations immediately Exploting Omar's patch that removes the taking of the queue lock in put_io_context_active, this patch moves back the operation of the bfq_exit_icq hook from a deferred work to the body of the function. Signed-off-by: Paolo Valente --- block/bfq-mq-iosched.c | 34 +++------------------------------- block/bfq-mq.h | 3 --- 2 files changed, 3 insertions(+), 34 deletions(-) diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c index 21b876aeba16..1deb79a47181 100644 --- a/block/bfq-mq-iosched.c +++ b/block/bfq-mq-iosched.c @@ -4080,28 +4080,13 @@ static void bfq_exit_icq_bfqq(struct bfq_io_cq *bic, bool is_sync) } } -static void bfq_exit_icq_body(struct work_struct *work) -{ - struct bfq_io_cq *bic = - container_of(work, struct bfq_io_cq, exit_icq_work); - - bfq_exit_icq_bfqq(bic, true); - bfq_exit_icq_bfqq(bic, false); -} - -static void bfq_init_icq(struct io_cq *icq) -{ - struct bfq_io_cq *bic = icq_to_bic(icq); - - INIT_WORK(&bic->exit_icq_work, bfq_exit_icq_body); -} - static void bfq_exit_icq(struct io_cq *icq) { struct bfq_io_cq *bic = icq_to_bic(icq); BUG_ON(!bic); - kblockd_schedule_work(&bic->exit_icq_work); + bfq_exit_icq_bfqq(bic, true); + bfq_exit_icq_bfqq(bic, false); } /* @@ -5019,21 +5004,9 @@ static void bfq_exit_queue(struct elevator_queue *e) BUG_ON(bfqd->in_service_queue); BUG_ON(!list_empty(&bfqd->active_list)); - list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list) { - if (bfqq->bic) /* bfqqs without bic are handled below */ - cancel_work_sync(&bfqq->bic->exit_icq_work); - } - spin_lock_irq(&bfqd->lock); - list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list) { + list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list) bfq_deactivate_bfqq(bfqd, bfqq, false, false); - /* - * Make sure that deferred exit_icq_work completes - * without errors for bfq_queues without bic - */ - if (!bfqq->bic) - bfqq->bfqd = NULL; - } spin_unlock_irq(&bfqd->lock); hrtimer_cancel(&bfqd->idle_slice_timer); @@ -5471,7 +5444,6 @@ static struct elevator_type iosched_bfq_mq = { .ops.mq = { .get_rq_priv = bfq_get_rq_private, .put_rq_priv = bfq_put_rq_private, - .init_icq = bfq_init_icq, .exit_icq = bfq_exit_icq, .insert_requests = bfq_insert_requests, .dispatch_request = bfq_dispatch_request, diff --git a/block/bfq-mq.h b/block/bfq-mq.h index c3fcd5ebd735..23744b246db6 100644 --- a/block/bfq-mq.h +++ b/block/bfq-mq.h @@ -356,9 +356,6 @@ struct bfq_io_cq { uint64_t blkcg_serial_nr; /* the current blkcg serial */ #endif - /* delayed work to exec the body of the the exit_icq handler */ - struct work_struct exit_icq_work; - /* * Snapshot of the has_short_time flag before merging; taken * to remember its value while the queue is merged, so as to From ab7e78a0ff095101de74e700f8743295a500bb20 Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Tue, 21 Feb 2017 10:26:22 +0100 Subject: [PATCH 16/51] Unnest request-queue and ioc locks from scheduler locks In some bio-merging functions, the request-queue lock needs to be taken, to lookup for the bic associated with the process that issued the bio that may need to be merged. In addition, put_io_context must be invoked in some other functions, and put_io_context may cause the lock of the involved ioc to be taken. In both cases, these extra request-queue or ioc locks are taken, or might be taken, while the scheduler lock is being held. In this respect, there are other code paths, in part external to bfq-mq, in which the same locks are taken (nested) in the opposite order, i.e., it is the scheduler lock to be taken while the request-queue or the ioc lock is being held. This leads to circular deadlocks. This commit addresses this issue by modifying the logic of the above functions, so as to let the lookup and put_io_context be performed, and thus the extra locks be taken, outside the critical sections protected by the scheduler lock. Signed-off-by: Paolo Valente --- block/bfq-cgroup-included.c | 9 ++ block/bfq-mq-iosched.c | 264 ++++++++++++++++++++++++++++---------------- block/bfq-mq.h | 25 ++++- block/bfq-sched.c | 11 ++ 4 files changed, 213 insertions(+), 96 deletions(-) diff --git a/block/bfq-cgroup-included.c b/block/bfq-cgroup-included.c index 8a73de76f32b..cf59eeb7f08e 100644 --- a/block/bfq-cgroup-included.c +++ b/block/bfq-cgroup-included.c @@ -716,6 +716,9 @@ static void bfq_pd_offline(struct blkg_policy_data *pd) struct bfq_group *bfqg; struct bfq_data *bfqd; struct bfq_entity *entity; +#ifdef BFQ_MQ + unsigned long flags; +#endif int i; BUG_ON(!pd); @@ -729,6 +732,9 @@ static void bfq_pd_offline(struct blkg_policy_data *pd) if (!entity) /* root group */ return; +#ifdef BFQ_MQ + spin_lock_irqsave(&bfqd->lock, flags); +#endif /* * Empty all service_trees belonging to this group before * deactivating the group itself. @@ -766,6 +772,9 @@ static void bfq_pd_offline(struct blkg_policy_data *pd) __bfq_deactivate_entity(entity, false); bfq_put_async_queues(bfqd, bfqg); +#ifdef BFQ_MQ + bfq_unlock_put_ioc_restore(bfqd, flags); +#endif /* * @blkg is going offline and will be ignored by * blkg_[rw]stat_recursive_sum(). Transfer stats to the parent so diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c index 1deb79a47181..69ef3761c95d 100644 --- a/block/bfq-mq-iosched.c +++ b/block/bfq-mq-iosched.c @@ -233,6 +233,7 @@ static struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd, return NULL; } +#define BFQ_MQ #include "bfq-sched.c" #include "bfq-cgroup-included.c" @@ -1564,15 +1565,9 @@ static struct request *bfq_find_rq_fmerge(struct bfq_data *bfqd, struct bio *bio, struct request_queue *q) { - struct task_struct *tsk = current; - struct bfq_io_cq *bic; - struct bfq_queue *bfqq; + struct bfq_queue *bfqq = bfqd->bio_bfqq; - bic = bfq_bic_lookup(bfqd, tsk->io_context, q); - if (!bic) - return NULL; - bfqq = bic_to_bfqq(bic, op_is_sync(bio->bi_opf)); if (bfqq) return elv_rb_find(&bfqq->sort_list, bio_end_sector(bio)); @@ -1693,9 +1688,26 @@ static bool bfq_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio) struct request_queue *q = hctx->queue; struct bfq_data *bfqd = q->elevator->elevator_data; struct request *free = NULL; + /* + * bfq_bic_lookup grabs the queue_lock: invoke it now and + * store its return value for later use, to avoid nesting + * queue_lock inside the bfqd->lock. We assume that the bic + * returned by bfq_bic_lookup does not go away before + * bfqd->lock is taken. + */ + struct bfq_io_cq *bic = bfq_bic_lookup(bfqd, current->io_context, q); bool ret; spin_lock_irq(&bfqd->lock); + + if (bic) + bfqd->bio_bfqq = bic_to_bfqq(bic, op_is_sync(bio->bi_opf)); + else + bfqd->bio_bfqq = NULL; + bfqd->bio_bic = bic; + /* Set next flag just for testing purposes */ + bfqd->bio_bfqq_set = true; + ret = blk_mq_sched_try_merge(q, bio, &free); /* @@ -1706,6 +1718,7 @@ static bool bfq_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio) */ if (free) blk_mq_free_request(free); + bfqd->bio_bfqq_set = false; spin_unlock_irq(&bfqd->lock); return ret; @@ -2261,8 +2274,7 @@ static bool bfq_allow_bio_merge(struct request_queue *q, struct request *rq, { struct bfq_data *bfqd = q->elevator->elevator_data; bool is_sync = op_is_sync(bio->bi_opf); - struct bfq_io_cq *bic; - struct bfq_queue *bfqq, *new_bfqq; + struct bfq_queue *bfqq = bfqd->bio_bfqq, *new_bfqq; /* * Disallow merge of a sync bio into an async request. @@ -2273,31 +2285,40 @@ static bool bfq_allow_bio_merge(struct request_queue *q, struct request *rq, /* * Lookup the bfqq that this bio will be queued with. Allow * merge only if rq is queued there. - * Queue lock is held here. */ - bic = bfq_bic_lookup(bfqd, current->io_context, q); - if (!bic) + if (!bfqq) return false; - assert_spin_locked(&bfqd->lock); - bfqq = bic_to_bfqq(bic, is_sync); /* * We take advantage of this function to perform an early merge * of the queues of possible cooperating processes. */ - if (bfqq) { - new_bfqq = bfq_setup_cooperator(bfqd, bfqq, bio, false); - if (new_bfqq) { - bfq_merge_bfqqs(bfqd, bic, bfqq, new_bfqq); - /* - * If we get here, the bio will be queued in the - * shared queue, i.e., new_bfqq, so use new_bfqq - * to decide whether bio and rq can be merged. - */ - bfqq = new_bfqq; - } - } + new_bfqq = bfq_setup_cooperator(bfqd, bfqq, bio, false); + if (new_bfqq) { + /* + * bic still points to bfqq, then it has not yet been + * redirected to some other bfq_queue, and a queue + * merge beween bfqq and new_bfqq can be safely + * fulfillled, i.e., bic can be redirected to new_bfqq + * and bfqq can be put. + */ + bfq_merge_bfqqs(bfqd, bfqd->bio_bic, bfqq, + new_bfqq); + /* + * If we get here, bio will be queued into new_queue, + * so use new_bfqq to decide whether bio and rq can be + * merged. + */ + bfqq = new_bfqq; + /* + * Change also bqfd->bio_bfqq, as + * bfqd->bio_bic now points to new_bfqq, and + * this function may be invoked again (and then may + * use again bqfd->bio_bfqq). + */ + bfqd->bio_bfqq = bfqq; + } return bfqq == RQ_BFQQ(rq); } @@ -3965,14 +3986,43 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) return rq; } +/* + * Next two functions release bfqd->lock and put the io context + * pointed by bfqd->ioc_to_put. This delayed put is used to not risk + * to take an ioc->lock while the scheduler lock is being held. + */ +static void bfq_unlock_put_ioc(struct bfq_data *bfqd) +{ + struct io_context *ioc_to_put = bfqd->ioc_to_put; + + bfqd->ioc_to_put = NULL; + spin_unlock_irq(&bfqd->lock); + + if (ioc_to_put) + put_io_context(ioc_to_put); +} + +static void bfq_unlock_put_ioc_restore(struct bfq_data *bfqd, + unsigned long flags) +{ + struct io_context *ioc_to_put = bfqd->ioc_to_put; + + bfqd->ioc_to_put = NULL; + spin_unlock_irqrestore(&bfqd->lock, flags); + + if (ioc_to_put) + put_io_context(ioc_to_put); +} + static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) { struct bfq_data *bfqd = hctx->queue->elevator->elevator_data; struct request *rq; spin_lock_irq(&bfqd->lock); + rq = __bfq_dispatch_request(hctx); - spin_unlock_irq(&bfqd->lock); + bfq_unlock_put_ioc(bfqd); return rq; } @@ -3981,7 +4031,7 @@ static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) * Task holds one reference to the queue, dropped when task exits. Each rq * in-flight on this queue also holds a reference, dropped when rq is freed. * - * Queue lock must be held here. Recall not to use bfqq after calling + * Scheduler lock must be held here. Recall not to use bfqq after calling * this function on it. */ static void bfq_put_queue(struct bfq_queue *bfqq) @@ -4066,17 +4116,23 @@ static void bfq_exit_icq_bfqq(struct bfq_io_cq *bic, bool is_sync) bfqd = bfqq->bfqd; /* NULL if scheduler already exited */ if (bfqq && bfqd) { - spin_lock_irq(&bfqd->lock); + unsigned long flags; + + spin_lock_irqsave(&bfqd->lock, flags); /* - * If the bic is using a shared queue, put the reference - * taken on the io_context when the bic started using a - * shared bfq_queue. + * If the bic is using a shared queue, put the + * reference taken on the io_context when the bic + * started using a shared bfq_queue. This put cannot + * make ioc->ref_count reach 0, then no ioc->lock + * risks to be taken (leading to possible deadlock + * scenarios). */ if (is_sync && bfq_bfqq_coop(bfqq)) put_io_context(bic->icq.ioc); + bfq_exit_bfqq(bfqd, bfqq); bic_set_bfqq(bic, NULL, is_sync); - spin_unlock_irq(&bfqd->lock); + bfq_unlock_put_ioc_restore(bfqd, flags); } } @@ -4183,8 +4239,6 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, INIT_HLIST_NODE(&bfqq->burst_list_node); BUG_ON(!hlist_unhashed(&bfqq->burst_list_node)); - spin_lock_init(&bfqq->lock); - bfqq->ref = 0; bfqq->bfqd = bfqd; @@ -4476,6 +4530,14 @@ static void __bfq_insert_request(struct bfq_data *bfqd, struct request *rq) new_bfqq->ref++; bfq_clear_bfqq_just_created(bfqq); + /* + * If the bic associated with the process + * issuing this request still points to bfqq + * (and thus has not been already redirected + * to new_bfqq or even some other bfq_queue), + * then complete the merge and redirect it to + * new_bfqq. + */ if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq) bfq_merge_bfqqs(bfqd, RQ_BIC(rq), bfqq, new_bfqq); @@ -4498,14 +4560,17 @@ static void __bfq_insert_request(struct bfq_data *bfqd, struct request *rq) } static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, - bool at_head) + bool at_head) { struct request_queue *q = hctx->queue; struct bfq_data *bfqd = q->elevator->elevator_data; spin_lock_irq(&bfqd->lock); - if (blk_mq_sched_try_insert_merge(q, rq)) - goto done; + if (blk_mq_sched_try_insert_merge(q, rq)) { + spin_unlock_irq(&bfqd->lock); + return; + } + spin_unlock_irq(&bfqd->lock); blk_mq_sched_request_inserted(rq); @@ -4530,8 +4595,8 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, q->last_merge = rq; } } -done: - spin_unlock_irq(&bfqd->lock); + + bfq_unlock_put_ioc(bfqd); } static void bfq_insert_requests(struct blk_mq_hw_ctx *hctx, @@ -4724,7 +4789,7 @@ static void bfq_put_rq_private(struct request_queue *q, struct request *rq) bfq_completed_request(bfqq, bfqd); bfq_put_rq_priv_body(bfqq); - spin_unlock_irqrestore(&bfqd->lock, flags); + bfq_unlock_put_ioc_restore(bfqd, flags); } else { /* * Request rq may be still/already in the scheduler, @@ -4732,10 +4797,10 @@ static void bfq_put_rq_private(struct request_queue *q, struct request *rq) * defer such a check and removal, to avoid * inconsistencies in the time interval from the end * of this function to the start of the deferred work. - * Fortunately, this situation occurs only in process - * context, so taking the scheduler lock does not - * cause any deadlock, even if other locks are already - * (correctly) held by this process. + * This situation seems to occur only in process + * context, as a consequence of a merge. In the + * current version of the code, this implies that the + * lock is held. */ BUG_ON(in_interrupt()); @@ -4758,8 +4823,6 @@ bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq) { bfq_log_bfqq(bfqq->bfqd, bfqq, "splitting queue"); - put_io_context(bic->icq.ioc); - if (bfqq_process_refs(bfqq) == 1) { bfqq->pid = current->pid; bfq_clear_bfqq_coop(bfqq); @@ -4775,6 +4838,41 @@ bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq) return NULL; } +static struct bfq_queue *bfq_get_bfqq_handle_split(struct bfq_data *bfqd, + struct bfq_io_cq *bic, + struct bio *bio, + bool split, bool is_sync, + bool *new_queue) +{ + struct bfq_queue *bfqq = bic_to_bfqq(bic, is_sync); + + if (likely(bfqq && bfqq != &bfqd->oom_bfqq)) + return bfqq; + + if (new_queue) + *new_queue = true; + + if (bfqq) + bfq_put_queue(bfqq); + bfqq = bfq_get_queue(bfqd, bio, is_sync, bic); + + bic_set_bfqq(bic, bfqq, is_sync); + if (split && is_sync) { + if ((bic->was_in_burst_list && bfqd->large_burst) || + bic->saved_in_large_burst) + bfq_mark_bfqq_in_large_burst(bfqq); + else { + bfq_clear_bfqq_in_large_burst(bfqq); + if (bic->was_in_burst_list) + hlist_add_head(&bfqq->burst_list_node, + &bfqd->burst_list); + } + bfqq->split_time = jiffies; + } + + return bfqq; +} + /* * Allocate bfq data structures associated with this request. */ @@ -4786,6 +4884,7 @@ static int bfq_get_rq_private(struct request_queue *q, struct request *rq, const int is_sync = rq_is_sync(rq); struct bfq_queue *bfqq; bool bfqq_already_existing = false, split = false; + bool new_queue = false; spin_lock_irq(&bfqd->lock); @@ -4796,42 +4895,10 @@ static int bfq_get_rq_private(struct request_queue *q, struct request *rq, bfq_bic_update_cgroup(bic, bio); -new_queue: - bfqq = bic_to_bfqq(bic, is_sync); - if (!bfqq || bfqq == &bfqd->oom_bfqq) { - if (bfqq) - bfq_put_queue(bfqq); - bfqq = bfq_get_queue(bfqd, bio, is_sync, bic); - BUG_ON(!hlist_unhashed(&bfqq->burst_list_node)); + bfqq = bfq_get_bfqq_handle_split(bfqd, bic, bio, false, is_sync, + &new_queue); - bic_set_bfqq(bic, bfqq, is_sync); - if (split && is_sync) { - bfq_log_bfqq(bfqd, bfqq, - "get_request: was_in_list %d " - "was_in_large_burst %d " - "large burst in progress %d", - bic->was_in_burst_list, - bic->saved_in_large_burst, - bfqd->large_burst); - - if ((bic->was_in_burst_list && bfqd->large_burst) || - bic->saved_in_large_burst) { - bfq_log_bfqq(bfqd, bfqq, - "get_request: marking in " - "large burst"); - bfq_mark_bfqq_in_large_burst(bfqq); - } else { - bfq_log_bfqq(bfqd, bfqq, - "get_request: clearing in " - "large burst"); - bfq_clear_bfqq_in_large_burst(bfqq); - if (bic->was_in_burst_list) - hlist_add_head(&bfqq->burst_list_node, - &bfqd->burst_list); - } - bfqq->split_time = jiffies; - } - } else { + if (unlikely(!new_queue)) { /* If the queue was seeky for too long, break it apart. */ if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) { bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq"); @@ -4841,9 +4908,19 @@ static int bfq_get_rq_private(struct request_queue *q, struct request *rq, bic->saved_in_large_burst = true; bfqq = bfq_split_bfqq(bic, bfqq); - split = true; + /* + * A reference to bic->icq.ioc needs to be + * released after a queue split. Do not do it + * immediately, to not risk to possibly take + * an ioc->lock while holding the scheduler + * lock. + */ + bfqd->ioc_to_put = bic->icq.ioc; + if (!bfqq) - goto new_queue; + bfqq = bfq_get_bfqq_handle_split(bfqd, bic, bio, + true, is_sync, + NULL); else bfqq_already_existing = true; } @@ -4861,18 +4938,17 @@ static int bfq_get_rq_private(struct request_queue *q, struct request *rq, /* * If a bfq_queue has only one process reference, it is owned - * by only one bfq_io_cq: we can set the bic field of the - * bfq_queue to the address of that structure. Also, if the - * queue has just been split, mark a flag so that the - * information is available to the other scheduler hooks. + * by only this bic: we can then set bfqq->bic = bic. in + * addition, if the queue has also just been split, we have to + * resume its state. */ if (likely(bfqq != &bfqd->oom_bfqq) && bfqq_process_refs(bfqq) == 1) { bfqq->bic = bic; - if (split) { + if (bfqd->ioc_to_put) { /* if true, then there has been a split */ /* - * If the queue has just been split from a shared - * queue, restore the idle window and the possible - * weight raising period. + * The queue has just been split from a shared + * queue: restore the idle window and the + * possible weight raising period. */ bfq_bfqq_resume_state(bfqq, bfqd, bic, bfqq_already_existing); @@ -4882,7 +4958,7 @@ static int bfq_get_rq_private(struct request_queue *q, struct request *rq, if (unlikely(bfq_bfqq_just_created(bfqq))) bfq_handle_burst(bfqd, bfqq); - spin_unlock_irq(&bfqd->lock); + bfq_unlock_put_ioc(bfqd); return 0; @@ -4929,7 +5005,7 @@ static void bfq_idle_slice_timer_body(struct bfq_queue *bfqq) bfq_bfqq_expire(bfqd, bfqq, true, reason); schedule_dispatch: - spin_unlock_irqrestore(&bfqd->lock, flags); + bfq_unlock_put_ioc_restore(bfqd, flags); bfq_schedule_dispatch(bfqd); } diff --git a/block/bfq-mq.h b/block/bfq-mq.h index 23744b246db6..bd83f1c02573 100644 --- a/block/bfq-mq.h +++ b/block/bfq-mq.h @@ -338,8 +338,6 @@ struct bfq_queue { unsigned long wr_start_at_switch_to_srt; unsigned long split_time; /* time of last split */ - - spinlock_t lock; }; /** @@ -609,6 +607,29 @@ struct bfq_data { struct bfq_queue oom_bfqq; spinlock_t lock; + + /* + * bic associated with the task issuing current bio for + * merging. This and the next field are used as a support to + * be able to perform the bic lookup, needed by bio-merge + * functions, before the scheduler lock is taken, and thus + * avoid taking the request-queue lock while the scheduler + * lock is being held. + */ + struct bfq_io_cq *bio_bic; + /* bfqq associated with the task issuing current bio for merging */ + struct bfq_queue *bio_bfqq; + /* Extra flag used only for TESTING */ + bool bio_bfqq_set; + + /* + * io context to put right after bfqd->lock is released. This + * filed is used to perform put_io_context, when needed, to + * after the scheduler lock has been released, and thus + * prevent an ioc->lock from being possibly taken while the + * scheduler lock is being held. + */ + struct io_context *ioc_to_put; }; enum bfqq_state_flags { diff --git a/block/bfq-sched.c b/block/bfq-sched.c index b54a638186e3..a5c8b4acd33c 100644 --- a/block/bfq-sched.c +++ b/block/bfq-sched.c @@ -1905,7 +1905,18 @@ static void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd) struct bfq_entity *entity = in_serv_entity; if (bfqd->in_service_bic) { +#ifdef BFQ_MQ + /* + * Schedule the release of a reference to + * bfqd->in_service_bic->icq.ioc to right after the + * scheduler lock is released. This ioc is not + * released immediately, to not risk to possibly take + * an ioc->lock while holding the scheduler lock. + */ + bfqd->ioc_to_put = bfqd->in_service_bic->icq.ioc; +#else put_io_context(bfqd->in_service_bic->icq.ioc); +#endif bfqd->in_service_bic = NULL; } From 84cc7140cb4f0574710625f51abbb076a1dd2920 Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Fri, 3 Mar 2017 09:31:14 +0100 Subject: [PATCH 17/51] Add checks and extra log messages - Part II Signed-off-by: Paolo Valente --- block/bfq-mq-iosched.c | 42 ++++++++++++++++++++++++++++++++++++++++-- block/bfq-sched.c | 1 + 2 files changed, 41 insertions(+), 2 deletions(-) diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c index 69ef3761c95d..5707d42b160d 100644 --- a/block/bfq-mq-iosched.c +++ b/block/bfq-mq-iosched.c @@ -1567,6 +1567,7 @@ static struct request *bfq_find_rq_fmerge(struct bfq_data *bfqd, { struct bfq_queue *bfqq = bfqd->bio_bfqq; + BUG_ON(!bfqd->bio_bfqq_set); if (bfqq) return elv_rb_find(&bfqq->sort_list, bio_end_sector(bio)); @@ -1719,6 +1720,7 @@ static bool bfq_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio) if (free) blk_mq_free_request(free); bfqd->bio_bfqq_set = false; + BUG_ON(bfqd->ioc_to_put); spin_unlock_irq(&bfqd->lock); return ret; @@ -1781,6 +1783,7 @@ static void bfq_request_merged(struct request_queue *q, struct request *req, bfq_updated_next_req(bfqd, bfqq); bfq_pos_tree_add_move(bfqd, bfqq); } + BUG_ON(bfqd->ioc_to_put); spin_unlock_irq(&bfqd->lock); } } @@ -1824,6 +1827,7 @@ static void bfq_requests_merged(struct request_queue *q, struct request *rq, bfq_remove_request(q, next); + BUG_ON(bfqq->bfqd->ioc_to_put); spin_unlock_irq(&bfqq->bfqd->lock); end: bfqg_stats_update_io_merged(bfqq_group(bfqq), next->cmd_flags); @@ -2195,9 +2199,11 @@ bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic, { bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu", (unsigned long) new_bfqq->pid); + BUG_ON(bfqq->bic && bfqq->bic == new_bfqq->bic); /* Save weight raising and idle window of the merged queues */ bfq_bfqq_save_state(bfqq); bfq_bfqq_save_state(new_bfqq); + if (bfq_bfqq_IO_bound(bfqq)) bfq_mark_bfqq_IO_bound(new_bfqq); bfq_clear_bfqq_IO_bound(bfqq); @@ -2276,6 +2282,7 @@ static bool bfq_allow_bio_merge(struct request_queue *q, struct request *rq, bool is_sync = op_is_sync(bio->bi_opf); struct bfq_queue *bfqq = bfqd->bio_bfqq, *new_bfqq; + assert_spin_locked(&bfqd->lock); /* * Disallow merge of a sync bio into an async request. */ @@ -2286,6 +2293,7 @@ static bool bfq_allow_bio_merge(struct request_queue *q, struct request *rq, * Lookup the bfqq that this bio will be queued with. Allow * merge only if rq is queued there. */ + BUG_ON(!bfqd->bio_bfqq_set); if (!bfqq) return false; @@ -2294,6 +2302,7 @@ static bool bfq_allow_bio_merge(struct request_queue *q, struct request *rq, * of the queues of possible cooperating processes. */ new_bfqq = bfq_setup_cooperator(bfqd, bfqq, bio, false); + BUG_ON(new_bfqq == bfqq); if (new_bfqq) { /* * bic still points to bfqq, then it has not yet been @@ -4040,6 +4049,8 @@ static void bfq_put_queue(struct bfq_queue *bfqq) struct bfq_group *bfqg = bfqq_group(bfqq); #endif + assert_spin_locked(&bfqq->bfqd->lock); + BUG_ON(bfqq->ref <= 0); if (bfqq->bfqd) @@ -4119,6 +4130,7 @@ static void bfq_exit_icq_bfqq(struct bfq_io_cq *bic, bool is_sync) unsigned long flags; spin_lock_irqsave(&bfqd->lock, flags); + BUG_ON(bfqd->ioc_to_put); /* * If the bic is using a shared queue, put the * reference taken on the io_context when the bic @@ -4567,10 +4579,12 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, spin_lock_irq(&bfqd->lock); if (blk_mq_sched_try_insert_merge(q, rq)) { + BUG_ON(bfqd->ioc_to_put); spin_unlock_irq(&bfqd->lock); return; } + BUG_ON(bfqd->ioc_to_put); spin_unlock_irq(&bfqd->lock); blk_mq_sched_request_inserted(rq); @@ -4785,6 +4799,7 @@ static void bfq_put_rq_private(struct request_queue *q, struct request *rq) unsigned long flags; spin_lock_irqsave(&bfqd->lock, flags); + BUG_ON(bfqd->ioc_to_put); bfq_completed_request(bfqq, bfqd); bfq_put_rq_priv_body(bfqq); @@ -4855,13 +4870,28 @@ static struct bfq_queue *bfq_get_bfqq_handle_split(struct bfq_data *bfqd, if (bfqq) bfq_put_queue(bfqq); bfqq = bfq_get_queue(bfqd, bio, is_sync, bic); + BUG_ON(!hlist_unhashed(&bfqq->burst_list_node)); bic_set_bfqq(bic, bfqq, is_sync); if (split && is_sync) { + bfq_log_bfqq(bfqd, bfqq, + "get_request: was_in_list %d " + "was_in_large_burst %d " + "large burst in progress %d", + bic->was_in_burst_list, + bic->saved_in_large_burst, + bfqd->large_burst); + if ((bic->was_in_burst_list && bfqd->large_burst) || - bic->saved_in_large_burst) + bic->saved_in_large_burst) { + bfq_log_bfqq(bfqd, bfqq, + "get_request: marking in " + "large burst"); bfq_mark_bfqq_in_large_burst(bfqq); - else { + } else { + bfq_log_bfqq(bfqd, bfqq, + "get_request: clearing in " + "large burst"); bfq_clear_bfqq_in_large_burst(bfqq); if (bic->was_in_burst_list) hlist_add_head(&bfqq->burst_list_node, @@ -4897,10 +4927,12 @@ static int bfq_get_rq_private(struct request_queue *q, struct request *rq, bfqq = bfq_get_bfqq_handle_split(bfqd, bic, bio, false, is_sync, &new_queue); + BUG_ON(bfqd->ioc_to_put); if (unlikely(!new_queue)) { /* If the queue was seeky for too long, break it apart. */ if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) { + BUG_ON(!is_sync); bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq"); /* Update bic before losing reference to bfqq */ @@ -4923,6 +4955,9 @@ static int bfq_get_rq_private(struct request_queue *q, struct request *rq, NULL); else bfqq_already_existing = true; + + BUG_ON(!bfqq); + BUG_ON(bfqq == &bfqd->oom_bfqq); } } @@ -4976,6 +5011,8 @@ static void bfq_idle_slice_timer_body(struct bfq_queue *bfqq) BUG_ON(!bfqd); spin_lock_irqsave(&bfqd->lock, flags); + BUG_ON(bfqd->ioc_to_put); + bfq_log_bfqq(bfqd, bfqq, "handling slice_timer expiration"); bfq_clear_bfqq_wait_request(bfqq); @@ -5083,6 +5120,7 @@ static void bfq_exit_queue(struct elevator_queue *e) spin_lock_irq(&bfqd->lock); list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list) bfq_deactivate_bfqq(bfqd, bfqq, false, false); + BUG_ON(bfqd->ioc_to_put); spin_unlock_irq(&bfqd->lock); hrtimer_cancel(&bfqd->idle_slice_timer); diff --git a/block/bfq-sched.c b/block/bfq-sched.c index a5c8b4acd33c..85e59eeb3569 100644 --- a/block/bfq-sched.c +++ b/block/bfq-sched.c @@ -1906,6 +1906,7 @@ static void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd) if (bfqd->in_service_bic) { #ifdef BFQ_MQ + BUG_ON(bfqd->ioc_to_put); /* * Schedule the release of a reference to * bfqd->in_service_bic->icq.ioc to right after the From 3d54cb804f1db2e08ce4a6cc335868538542f587 Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Wed, 22 Feb 2017 11:30:01 +0100 Subject: [PATCH 18/51] Fix unbalanced increment of rq_in_driver Signed-off-by: Paolo Valente --- block/bfq-mq-iosched.c | 52 +++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 43 insertions(+), 9 deletions(-) diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c index 5707d42b160d..9cbcb8d43d81 100644 --- a/block/bfq-mq-iosched.c +++ b/block/bfq-mq-iosched.c @@ -3936,9 +3936,45 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) rq = list_first_entry(&bfqd->dispatch, struct request, queuelist); list_del_init(&rq->queuelist); + bfq_log(bfqd, "dispatch requests: picked %p from dispatch list", rq); - goto exit; + bfqq = RQ_BFQQ(rq); + + if (bfqq) { + /* + * Increment counters here, because this + * dispatch does not follow the standard + * dispatch flow (where counters are + * incremented) + */ + bfqq->dispatched++; + + goto inc_in_driver_start_rq; + } + + /* + * We exploit the put_rq_private hook to decrement + * rq_in_driver, but put_rq_private will not be + * invoked on this request. So, to avoid unbalance, + * just start this request, without incrementing + * rq_in_driver. As a negative consequence, + * rq_in_driver is deceptively lower than it should be + * while this request is in service. This may cause + * bfq_schedule_dispatch to be invoked uselessly. + * + * As for implementing an exact solution, the + * put_request hook, if defined, is probably invoked + * also on this request. So, by exploiting this hook, + * we could 1) increment rq_in_driver here, and 2) + * decrement it in put_request. Such a solution would + * let the value of the counter be always accurate, + * but it would entail using an extra interface + * function. This cost seems higher than the benefit, + * being the frequency of non-elevator-private + * requests very low. + */ + goto start_rq; } bfq_log(bfqd, "dispatch requests: %d busy queues", bfqd->busy_queues); @@ -3973,10 +4009,12 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) BUG_ON(bfqq->next_rq == NULL && bfqq->entity.budget < bfqq->entity.service); -exit: + if (rq) { - rq->rq_flags |= RQF_STARTED; + inc_in_driver_start_rq: bfqd->rq_in_driver++; + start_rq: + rq->rq_flags |= RQF_STARTED; if (bfqq) bfq_log_bfqq(bfqd, bfqq, "dispatched %s request %p, rq_in_driver %d", @@ -3992,6 +4030,7 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) "returned NULL request, rq_in_driver %d", bfqd->rq_in_driver); +exit: return rq; } @@ -4591,15 +4630,10 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, spin_lock_irq(&bfqd->lock); if (at_head || blk_rq_is_passthrough(rq)) { - struct bfq_queue *bfqq = RQ_BFQQ(rq); - if (at_head) list_add(&rq->queuelist, &bfqd->dispatch); else list_add_tail(&rq->queuelist, &bfqd->dispatch); - - if (bfqq) - bfqq->dispatched++; } else { __bfq_insert_request(bfqd, rq); @@ -4966,7 +5000,7 @@ static int bfq_get_rq_private(struct request_queue *q, struct request *rq, "get_request: new allocated %d", bfqq->allocated); bfqq->ref++; - bfq_log_bfqq(bfqd, bfqq, "get_request: bfqq %p, %d", bfqq, bfqq->ref); + bfq_log_bfqq(bfqd, bfqq, "get_request %p: bfqq %p, %d", rq, bfqq, bfqq->ref); rq->elv.priv[0] = bic; rq->elv.priv[1] = bfqq; From 7ba977d696b239569b4cd233aebc99e136ecf487 Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Fri, 3 Mar 2017 09:39:35 +0100 Subject: [PATCH 19/51] Add checks and extra log messages - Part III Signed-off-by: Paolo Valente --- block/bfq-mq-iosched.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c index 9cbcb8d43d81..24b529a2edc7 100644 --- a/block/bfq-mq-iosched.c +++ b/block/bfq-mq-iosched.c @@ -4630,10 +4630,21 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, spin_lock_irq(&bfqd->lock); if (at_head || blk_rq_is_passthrough(rq)) { + struct bfq_queue *bfqq = RQ_BFQQ(rq); + if (at_head) list_add(&rq->queuelist, &bfqd->dispatch); else list_add_tail(&rq->queuelist, &bfqd->dispatch); + + if (bfqq) + bfq_log_bfqq(bfqd, bfqq, + "insert_request %p in disp: at_head %d", + rq, at_head); + else + bfq_log(bfqd, + "insert_request %p in disp: at_head %d", + rq, at_head); } else { __bfq_insert_request(bfqd, rq); From c94e47b2908600b8ba89f84b0ac7febddd313141 Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Fri, 17 Feb 2017 14:28:02 +0100 Subject: [PATCH 20/51] TESTING: Check wrong invocation of merge and put_rq_priv functions Check that merge functions are not invoked on requests queued in the dispatch queue, and that neither put_rq_private is invoked on these requests if, in addition, they have not passed through get_rq_private. Signed-off-by: Paolo Valente --- block/bfq-mq-iosched.c | 22 ++++++++++++++++++++++ include/linux/blkdev.h | 2 ++ 2 files changed, 24 insertions(+) diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c index 24b529a2edc7..b4d40bb712d2 100644 --- a/block/bfq-mq-iosched.c +++ b/block/bfq-mq-iosched.c @@ -1746,6 +1746,8 @@ static int bfq_request_merge(struct request_queue *q, struct request **req, static void bfq_request_merged(struct request_queue *q, struct request *req, enum elv_merge type) { + BUG_ON(req->rq_flags & RQF_DISP_LIST); + if (type == ELEVATOR_FRONT_MERGE && rb_prev(&req->rb_node) && blk_rq_pos(req) < @@ -1795,6 +1797,8 @@ static void bfq_requests_merged(struct request_queue *q, struct request *rq, BUG_ON(!RQ_BFQQ(rq)); BUG_ON(!RQ_BFQQ(next)); + BUG_ON(rq->rq_flags & RQF_DISP_LIST); + BUG_ON(next->rq_flags & RQF_DISP_LIST); if (!RB_EMPTY_NODE(&rq->rb_node)) goto end; @@ -3936,6 +3940,7 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) rq = list_first_entry(&bfqd->dispatch, struct request, queuelist); list_del_init(&rq->queuelist); + rq->rq_flags &= ~RQF_DISP_LIST; bfq_log(bfqd, "dispatch requests: picked %p from dispatch list", rq); @@ -3950,6 +3955,17 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) */ bfqq->dispatched++; + /* + * TESTING: reset DISP_LIST flag, because: 1) + * this rq this request has passed through + * get_rq_private, 2) then it will have + * put_rq_private invoked on it, and 3) in + * put_rq_private we use this flag to check + * that put_rq_private is not invoked on + * requests for which get_rq_private has been + * invoked. + */ + rq->rq_flags &= ~RQF_DISP_LIST; goto inc_in_driver_start_rq; } @@ -4637,6 +4653,7 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, else list_add_tail(&rq->queuelist, &bfqd->dispatch); + rq->rq_flags |= RQF_DISP_LIST; if (bfqq) bfq_log_bfqq(bfqd, bfqq, "insert_request %p in disp: at_head %d", @@ -4824,6 +4841,10 @@ static void bfq_put_rq_private(struct request_queue *q, struct request *rq) bfqd = bfqq->bfqd; BUG_ON(!bfqd); + if (rq->rq_flags & RQF_DISP_LIST) { + pr_crit("putting disp rq %p for %d", rq, bfqq->pid); + BUG(); + } BUG_ON(rq->rq_flags & RQF_QUEUED); BUG_ON(!(rq->rq_flags & RQF_ELVPRIV)); @@ -5015,6 +5036,7 @@ static int bfq_get_rq_private(struct request_queue *q, struct request *rq, rq->elv.priv[0] = bic; rq->elv.priv[1] = bfqq; + rq->rq_flags &= ~RQF_DISP_LIST; /* * If a bfq_queue has only one process reference, it is owned diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 10f892ca585d..0048e59e6d07 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -121,6 +121,8 @@ typedef __u32 __bitwise req_flags_t; /* Look at ->special_vec for the actual data payload instead of the bio chain. */ #define RQF_SPECIAL_PAYLOAD ((__force req_flags_t)(1 << 18)) +/* DEBUG: rq in bfq-mq dispatch list */ +#define RQF_DISP_LIST ((__force req_flags_t)(1 << 19)) /* flags that prevent us from merging requests: */ #define RQF_NOMERGE_FLAGS \ From 49206f9052d13c96d49dbc36c612bed41b2d6552 Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Sat, 25 Feb 2017 17:38:05 +0100 Subject: [PATCH 21/51] Complete support for cgroups This commit completes cgroups support for bfq-mq. In particular, it deals with a sort of circular dependency introduced in blk-mq: the function blkcg_activate_policy, invoked during scheduler initialization, triggers the invocation of the has_work scheduler hook (before the init function is finished). To adress this issue, this commit moves the invocation of blkcg_activate_policy after the initialization of all the fields that could be initialized before invoking blkcg_activate_policy itself. This enables has_work to correctly return false, and thus to prevent the blk-mq stack from invoking further scheduler hooks before the init function is finished. Signed-off-by: Paolo Valente --- block/Kconfig.iosched | 9 +++++ block/bfq-mq-iosched.c | 108 ++++++++++++++++++++++++++++--------------------- block/bfq-mq.h | 2 +- 3 files changed, 72 insertions(+), 47 deletions(-) diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched index 2d94af3d8b0a..299a6861fb90 100644 --- a/block/Kconfig.iosched +++ b/block/Kconfig.iosched @@ -106,6 +106,15 @@ config MQ_IOSCHED_BFQ guarantees a low latency to interactive and soft real-time applications. Details in Documentation/block/bfq-iosched.txt +config MQ_BFQ_GROUP_IOSCHED + bool "BFQ-MQ hierarchical scheduling support" + depends on MQ_IOSCHED_BFQ && BLK_CGROUP + default n + ---help--- + + Enable hierarchical scheduling in BFQ-MQ, using the blkio + (cgroups-v1) or io (cgroups-v2) controller. + config MQ_IOSCHED_DEADLINE tristate "MQ deadline I/O scheduler" default y diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c index b4d40bb712d2..02a1e7fd0ea4 100644 --- a/block/bfq-mq-iosched.c +++ b/block/bfq-mq-iosched.c @@ -88,7 +88,6 @@ #include "blk-mq.h" #include "blk-mq-tag.h" #include "blk-mq-sched.h" -#undef CONFIG_BFQ_GROUP_IOSCHED /* cgroups support not yet functional */ #include "bfq-mq.h" /* Expiration time of sync (0) and async (1) requests, in ns. */ @@ -233,15 +232,6 @@ static struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd, return NULL; } -#define BFQ_MQ -#include "bfq-sched.c" -#include "bfq-cgroup-included.c" - -#define bfq_class_idle(bfqq) ((bfqq)->ioprio_class == IOPRIO_CLASS_IDLE) -#define bfq_class_rt(bfqq) ((bfqq)->ioprio_class == IOPRIO_CLASS_RT) - -#define bfq_sample_valid(samples) ((samples) > 80) - /* * Scheduler run of queue, if there are requests pending and no one in the * driver that will restart queueing. @@ -255,6 +245,43 @@ static void bfq_schedule_dispatch(struct bfq_data *bfqd) } /* + * Next two functions release bfqd->lock and put the io context + * pointed by bfqd->ioc_to_put. This delayed put is used to not risk + * to take an ioc->lock while the scheduler lock is being held. + */ +static void bfq_unlock_put_ioc(struct bfq_data *bfqd) +{ + struct io_context *ioc_to_put = bfqd->ioc_to_put; + + bfqd->ioc_to_put = NULL; + spin_unlock_irq(&bfqd->lock); + + if (ioc_to_put) + put_io_context(ioc_to_put); +} + +static void bfq_unlock_put_ioc_restore(struct bfq_data *bfqd, + unsigned long flags) +{ + struct io_context *ioc_to_put = bfqd->ioc_to_put; + + bfqd->ioc_to_put = NULL; + spin_unlock_irqrestore(&bfqd->lock, flags); + + if (ioc_to_put) + put_io_context(ioc_to_put); +} + +#define BFQ_MQ +#include "bfq-sched.c" +#include "bfq-cgroup-included.c" + +#define bfq_class_idle(bfqq) ((bfqq)->ioprio_class == IOPRIO_CLASS_IDLE) +#define bfq_class_rt(bfqq) ((bfqq)->ioprio_class == IOPRIO_CLASS_RT) + +#define bfq_sample_valid(samples) ((samples) > 80) + +/* * Lifted from AS - choose which of rq1 and rq2 that is best served now. * We choose the request that is closesr to the head right now. Distance * behind the head is penalized and only allowed to a certain extent. @@ -4050,34 +4077,6 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) return rq; } -/* - * Next two functions release bfqd->lock and put the io context - * pointed by bfqd->ioc_to_put. This delayed put is used to not risk - * to take an ioc->lock while the scheduler lock is being held. - */ -static void bfq_unlock_put_ioc(struct bfq_data *bfqd) -{ - struct io_context *ioc_to_put = bfqd->ioc_to_put; - - bfqd->ioc_to_put = NULL; - spin_unlock_irq(&bfqd->lock); - - if (ioc_to_put) - put_io_context(ioc_to_put); -} - -static void bfq_unlock_put_ioc_restore(struct bfq_data *bfqd, - unsigned long flags) -{ - struct io_context *ioc_to_put = bfqd->ioc_to_put; - - bfqd->ioc_to_put = NULL; - spin_unlock_irqrestore(&bfqd->lock, flags); - - if (ioc_to_put) - put_io_context(ioc_to_put); -} - static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) { struct bfq_data *bfqd = hctx->queue->elevator->elevator_data; @@ -5239,6 +5238,10 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) } eq->elevator_data = bfqd; + spin_lock_irq(q->queue_lock); + q->elevator = eq; + spin_unlock_irq(q->queue_lock); + /* * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues. * Grab a permanent reference to it, so that the normal code flow @@ -5261,12 +5264,7 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) bfqd->oom_bfqq.entity.prio_changed = 1; bfqd->queue = q; - - bfqd->root_group = bfq_create_group_hierarchy(bfqd, q->node); - if (!bfqd->root_group) - goto out_free; - bfq_init_root_group(bfqd->root_group, bfqd); - bfq_init_entity(&bfqd->oom_bfqq.entity, bfqd->root_group); + INIT_LIST_HEAD(&bfqd->dispatch); hrtimer_init(&bfqd->idle_slice_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); @@ -5324,9 +5322,27 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) bfqd->device_speed = BFQ_BFQD_FAST; spin_lock_init(&bfqd->lock); - INIT_LIST_HEAD(&bfqd->dispatch); - q->elevator = eq; + /* + * The invocation of the next bfq_create_group_hierarchy + * function is the head of a chain of function calls + * (bfq_create_group_hierarchy->blkcg_activate_policy-> + * blk_mq_freeze_queue) that may lead to the invocation of the + * has_work hook function. For this reason, + * bfq_create_group_hierarchy is invoked only after all + * scheduler data has been initialized, apart from the fields + * that can be initialized only after invoking + * bfq_create_group_hierarchy. This, in particular, enables + * has_work to correctly return false. Of course, to avoid + * other inconsistencies, the blk-mq stack must then refrain + * from invoking further scheduler hooks before this init + * function is finished. + */ + bfqd->root_group = bfq_create_group_hierarchy(bfqd, q->node); + if (!bfqd->root_group) + goto out_free; + bfq_init_root_group(bfqd->root_group, bfqd); + bfq_init_entity(&bfqd->oom_bfqq.entity, bfqd->root_group); return 0; diff --git a/block/bfq-mq.h b/block/bfq-mq.h index bd83f1c02573..2c81c02bccc4 100644 --- a/block/bfq-mq.h +++ b/block/bfq-mq.h @@ -20,7 +20,7 @@ #include /* see comments on CONFIG_BFQ_GROUP_IOSCHED in bfq.h */ -#ifdef CONFIG_BFQ_MQ_GROUP_IOSCHED +#ifdef CONFIG_MQ_BFQ_GROUP_IOSCHED #define BFQ_GROUP_IOSCHED_ENABLED #endif From 62d12db23ce14d2716b5cff7d2635fbc817b96d0 Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Fri, 17 Mar 2017 06:15:18 +0100 Subject: [PATCH 22/51] Remove all get and put of I/O contexts When a bfq queue is set in service and when it is merged, a reference to the I/O context associated with the queue is taken. This reference is then released when the queue is deselected from service or split. More precisely, the release of the reference is postponed to when the scheduler lock is released, to avoid nesting between the scheduler and the I/O-context lock. In fact, such nesting would lead to deadlocks, because of other code paths that take the same locks in the opposite order. This postponing of I/O-context releases does complicate code. This commit addresses this issue by modifying involved operations in such a way to not need to get the above I/O-context references any more. Then it also removes any get and release of these references. Signed-off-by: Paolo Valente --- block/bfq-cgroup-included.c | 2 +- block/bfq-mq-iosched.c | 127 ++++++++------------------------------------ block/bfq-mq.h | 11 ---- block/bfq-sched.c | 17 ------ 4 files changed, 22 insertions(+), 135 deletions(-) diff --git a/block/bfq-cgroup-included.c b/block/bfq-cgroup-included.c index cf59eeb7f08e..dfacca799b5e 100644 --- a/block/bfq-cgroup-included.c +++ b/block/bfq-cgroup-included.c @@ -773,7 +773,7 @@ static void bfq_pd_offline(struct blkg_policy_data *pd) bfq_put_async_queues(bfqd, bfqg); #ifdef BFQ_MQ - bfq_unlock_put_ioc_restore(bfqd, flags); + spin_unlock_irqrestore(&bfqd->lock, flags); #endif /* * @blkg is going offline and will be ignored by diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c index 02a1e7fd0ea4..8e7589d3280f 100644 --- a/block/bfq-mq-iosched.c +++ b/block/bfq-mq-iosched.c @@ -244,34 +244,6 @@ static void bfq_schedule_dispatch(struct bfq_data *bfqd) } } -/* - * Next two functions release bfqd->lock and put the io context - * pointed by bfqd->ioc_to_put. This delayed put is used to not risk - * to take an ioc->lock while the scheduler lock is being held. - */ -static void bfq_unlock_put_ioc(struct bfq_data *bfqd) -{ - struct io_context *ioc_to_put = bfqd->ioc_to_put; - - bfqd->ioc_to_put = NULL; - spin_unlock_irq(&bfqd->lock); - - if (ioc_to_put) - put_io_context(ioc_to_put); -} - -static void bfq_unlock_put_ioc_restore(struct bfq_data *bfqd, - unsigned long flags) -{ - struct io_context *ioc_to_put = bfqd->ioc_to_put; - - bfqd->ioc_to_put = NULL; - spin_unlock_irqrestore(&bfqd->lock, flags); - - if (ioc_to_put) - put_io_context(ioc_to_put); -} - #define BFQ_MQ #include "bfq-sched.c" #include "bfq-cgroup-included.c" @@ -1747,7 +1719,6 @@ static bool bfq_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio) if (free) blk_mq_free_request(free); bfqd->bio_bfqq_set = false; - BUG_ON(bfqd->ioc_to_put); spin_unlock_irq(&bfqd->lock); return ret; @@ -1812,7 +1783,6 @@ static void bfq_request_merged(struct request_queue *q, struct request *req, bfq_updated_next_req(bfqd, bfqq); bfq_pos_tree_add_move(bfqd, bfqq); } - BUG_ON(bfqd->ioc_to_put); spin_unlock_irq(&bfqd->lock); } } @@ -1858,7 +1828,6 @@ static void bfq_requests_merged(struct request_queue *q, struct request *rq, bfq_remove_request(q, next); - BUG_ON(bfqq->bfqd->ioc_to_put); spin_unlock_irq(&bfqq->bfqd->lock); end: bfqg_stats_update_io_merged(bfqq_group(bfqq), next->cmd_flags); @@ -2035,20 +2004,18 @@ bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) * first time that the requests of some process are redirected to * it. * - * We redirect bfqq to new_bfqq and not the opposite, because we - * are in the context of the process owning bfqq, hence we have - * the io_cq of this process. So we can immediately configure this - * io_cq to redirect the requests of the process to new_bfqq. + * We redirect bfqq to new_bfqq and not the opposite, because + * we are in the context of the process owning bfqq, thus we + * have the io_cq of this process. So we can immediately + * configure this io_cq to redirect the requests of the + * process to new_bfqq. In contrast, the io_cq of new_bfqq is + * not available any more (new_bfqq->bic == NULL). * - * NOTE, even if new_bfqq coincides with the in-service queue, the - * io_cq of new_bfqq is not available, because, if the in-service - * queue is shared, bfqd->in_service_bic may not point to the - * io_cq of the in-service queue. - * Redirecting the requests of the process owning bfqq to the - * currently in-service queue is in any case the best option, as - * we feed the in-service queue with new requests close to the - * last request served and, by doing so, hopefully increase the - * throughput. + * Anyway, even in case new_bfqq coincides with the in-service + * queue, redirecting requests the in-service queue is the + * best option, as we feed the in-service queue with new + * requests close to the last request served and, by doing so, + * are likely to increase the throughput. */ bfqq->new_bfqq = new_bfqq; new_bfqq->ref += process_refs; @@ -2147,13 +2114,13 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, in_service_bfqq = bfqd->in_service_queue; if (in_service_bfqq && in_service_bfqq != bfqq && - bfqd->in_service_bic && wr_from_too_long(in_service_bfqq) + wr_from_too_long(in_service_bfqq) && likely(in_service_bfqq == &bfqd->oom_bfqq)) bfq_log_bfqq(bfqd, bfqq, "would have tried merge with in-service-queue, but wr"); - if (!in_service_bfqq || in_service_bfqq == bfqq || - !bfqd->in_service_bic || wr_from_too_long(in_service_bfqq) || + if (!in_service_bfqq || in_service_bfqq == bfqq + || wr_from_too_long(in_service_bfqq) || unlikely(in_service_bfqq == &bfqd->oom_bfqq)) goto check_scheduled; @@ -2214,16 +2181,6 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq) BUG_ON(time_is_after_jiffies(bfqq->last_wr_start_finish)); } -static void bfq_get_bic_reference(struct bfq_queue *bfqq) -{ - /* - * If bfqq->bic has a non-NULL value, the bic to which it belongs - * is about to begin using a shared bfq_queue. - */ - if (bfqq->bic) - atomic_long_inc(&bfqq->bic->icq.ioc->refcount); -} - static void bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic, struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) @@ -2280,12 +2237,6 @@ bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic, bfqd->wr_busy_queues); /* - * Grab a reference to the bic, to prevent it from being destroyed - * before being possibly touched by a bfq_split_bfqq(). - */ - bfq_get_bic_reference(bfqq); - bfq_get_bic_reference(new_bfqq); - /* * Merge queues (that is, let bic redirect its requests to new_bfqq) */ bic_set_bfqq(bic, new_bfqq, 1); @@ -2472,16 +2423,10 @@ static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd) static void bfq_arm_slice_timer(struct bfq_data *bfqd) { struct bfq_queue *bfqq = bfqd->in_service_queue; - struct bfq_io_cq *bic; u32 sl; BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list)); - /* Processes have exited, don't wait. */ - bic = bfqd->in_service_bic; - if (!bic || atomic_read(&bic->icq.ioc->active_ref) == 0) - return; - bfq_mark_bfqq_wait_request(bfqq); /* @@ -3922,11 +3867,6 @@ static struct request *bfq_dispatch_rq_from_bfqq(struct bfq_data *bfqd, bfq_bfqq_budget_left(bfqq), bfqq->dispatched); - if (!bfqd->in_service_bic) { - atomic_long_inc(&RQ_BIC(rq)->icq.ioc->refcount); - bfqd->in_service_bic = RQ_BIC(rq); - } - /* * Expire bfqq, pretending that its budget expired, if bfqq * belongs to CLASS_IDLE and other queues are waiting for @@ -4085,7 +4025,7 @@ static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) spin_lock_irq(&bfqd->lock); rq = __bfq_dispatch_request(hctx); - bfq_unlock_put_ioc(bfqd); + spin_unlock_irq(&bfqd->lock); return rq; } @@ -4184,21 +4124,10 @@ static void bfq_exit_icq_bfqq(struct bfq_io_cq *bic, bool is_sync) unsigned long flags; spin_lock_irqsave(&bfqd->lock, flags); - BUG_ON(bfqd->ioc_to_put); - /* - * If the bic is using a shared queue, put the - * reference taken on the io_context when the bic - * started using a shared bfq_queue. This put cannot - * make ioc->ref_count reach 0, then no ioc->lock - * risks to be taken (leading to possible deadlock - * scenarios). - */ - if (is_sync && bfq_bfqq_coop(bfqq)) - put_io_context(bic->icq.ioc); bfq_exit_bfqq(bfqd, bfqq); bic_set_bfqq(bic, NULL, is_sync); - bfq_unlock_put_ioc_restore(bfqd, flags); + spin_unlock_irqrestore(&bfqd->lock, flags); } } @@ -4633,12 +4562,10 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, spin_lock_irq(&bfqd->lock); if (blk_mq_sched_try_insert_merge(q, rq)) { - BUG_ON(bfqd->ioc_to_put); spin_unlock_irq(&bfqd->lock); return; } - BUG_ON(bfqd->ioc_to_put); spin_unlock_irq(&bfqd->lock); blk_mq_sched_request_inserted(rq); @@ -4671,7 +4598,7 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, } } - bfq_unlock_put_ioc(bfqd); + spin_unlock_irq(&bfqd->lock); } static void bfq_insert_requests(struct blk_mq_hw_ctx *hctx, @@ -4864,12 +4791,11 @@ static void bfq_put_rq_private(struct request_queue *q, struct request *rq) unsigned long flags; spin_lock_irqsave(&bfqd->lock, flags); - BUG_ON(bfqd->ioc_to_put); bfq_completed_request(bfqq, bfqd); bfq_put_rq_priv_body(bfqq); - bfq_unlock_put_ioc_restore(bfqd, flags); + spin_unlock_irqrestore(&bfqd->lock, flags); } else { /* * Request rq may be still/already in the scheduler, @@ -4992,7 +4918,6 @@ static int bfq_get_rq_private(struct request_queue *q, struct request *rq, bfqq = bfq_get_bfqq_handle_split(bfqd, bic, bio, false, is_sync, &new_queue); - BUG_ON(bfqd->ioc_to_put); if (unlikely(!new_queue)) { /* If the queue was seeky for too long, break it apart. */ @@ -5005,14 +4930,6 @@ static int bfq_get_rq_private(struct request_queue *q, struct request *rq, bic->saved_in_large_burst = true; bfqq = bfq_split_bfqq(bic, bfqq); - /* - * A reference to bic->icq.ioc needs to be - * released after a queue split. Do not do it - * immediately, to not risk to possibly take - * an ioc->lock while holding the scheduler - * lock. - */ - bfqd->ioc_to_put = bic->icq.ioc; if (!bfqq) bfqq = bfq_get_bfqq_handle_split(bfqd, bic, bio, @@ -5045,7 +4962,7 @@ static int bfq_get_rq_private(struct request_queue *q, struct request *rq, */ if (likely(bfqq != &bfqd->oom_bfqq) && bfqq_process_refs(bfqq) == 1) { bfqq->bic = bic; - if (bfqd->ioc_to_put) { /* if true, then there has been a split */ + if (split) { /* * The queue has just been split from a shared * queue: restore the idle window and the @@ -5059,7 +4976,7 @@ static int bfq_get_rq_private(struct request_queue *q, struct request *rq, if (unlikely(bfq_bfqq_just_created(bfqq))) bfq_handle_burst(bfqd, bfqq); - bfq_unlock_put_ioc(bfqd); + spin_unlock_irq(&bfqd->lock); return 0; @@ -5077,7 +4994,6 @@ static void bfq_idle_slice_timer_body(struct bfq_queue *bfqq) BUG_ON(!bfqd); spin_lock_irqsave(&bfqd->lock, flags); - BUG_ON(bfqd->ioc_to_put); bfq_log_bfqq(bfqd, bfqq, "handling slice_timer expiration"); bfq_clear_bfqq_wait_request(bfqq); @@ -5108,7 +5024,7 @@ static void bfq_idle_slice_timer_body(struct bfq_queue *bfqq) bfq_bfqq_expire(bfqd, bfqq, true, reason); schedule_dispatch: - bfq_unlock_put_ioc_restore(bfqd, flags); + spin_unlock_irqrestore(&bfqd->lock, flags); bfq_schedule_dispatch(bfqd); } @@ -5186,7 +5102,6 @@ static void bfq_exit_queue(struct elevator_queue *e) spin_lock_irq(&bfqd->lock); list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list) bfq_deactivate_bfqq(bfqd, bfqq, false, false); - BUG_ON(bfqd->ioc_to_put); spin_unlock_irq(&bfqd->lock); hrtimer_cancel(&bfqd->idle_slice_timer); diff --git a/block/bfq-mq.h b/block/bfq-mq.h index 2c81c02bccc4..36ee24a87dda 100644 --- a/block/bfq-mq.h +++ b/block/bfq-mq.h @@ -458,8 +458,6 @@ struct bfq_data { /* bfq_queue in service */ struct bfq_queue *in_service_queue; - /* bfq_io_cq (bic) associated with the @in_service_queue */ - struct bfq_io_cq *in_service_bic; /* on-disk position of the last served request */ sector_t last_position; @@ -621,15 +619,6 @@ struct bfq_data { struct bfq_queue *bio_bfqq; /* Extra flag used only for TESTING */ bool bio_bfqq_set; - - /* - * io context to put right after bfqd->lock is released. This - * filed is used to perform put_io_context, when needed, to - * after the scheduler lock has been released, and thus - * prevent an ioc->lock from being possibly taken while the - * scheduler lock is being held. - */ - struct io_context *ioc_to_put; }; enum bfqq_state_flags { diff --git a/block/bfq-sched.c b/block/bfq-sched.c index 85e59eeb3569..9c4e6797d8c9 100644 --- a/block/bfq-sched.c +++ b/block/bfq-sched.c @@ -1904,23 +1904,6 @@ static void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd) struct bfq_entity *in_serv_entity = &in_serv_bfqq->entity; struct bfq_entity *entity = in_serv_entity; - if (bfqd->in_service_bic) { -#ifdef BFQ_MQ - BUG_ON(bfqd->ioc_to_put); - /* - * Schedule the release of a reference to - * bfqd->in_service_bic->icq.ioc to right after the - * scheduler lock is released. This ioc is not - * released immediately, to not risk to possibly take - * an ioc->lock while holding the scheduler lock. - */ - bfqd->ioc_to_put = bfqd->in_service_bic->icq.ioc; -#else - put_io_context(bfqd->in_service_bic->icq.ioc); -#endif - bfqd->in_service_bic = NULL; - } - bfq_clear_bfqq_wait_request(in_serv_bfqq); hrtimer_try_to_cancel(&bfqd->idle_slice_timer); bfqd->in_service_queue = NULL; From 1521ad11f8684cf0a1b7249249cd406fee50da6d Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Wed, 29 Mar 2017 18:41:46 +0200 Subject: [PATCH 23/51] BUGFIX: Remove unneeded and deadlock-causing lock in request_merged Signed-off-by: Paolo Valente --- block/bfq-mq-iosched.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c index 8e7589d3280f..bb046335ff4f 100644 --- a/block/bfq-mq-iosched.c +++ b/block/bfq-mq-iosched.c @@ -1761,7 +1761,6 @@ static void bfq_request_merged(struct request_queue *q, struct request *req, BUG_ON(RQ_BFQQ(req) != bfqq); elv_rb_add(&bfqq->sort_list, req); - spin_lock_irq(&bfqd->lock); /* Choose next request to be served for bfqq */ prev = bfqq->next_rq; next_rq = bfq_choose_req(bfqd, bfqq->next_rq, req, @@ -1783,7 +1782,6 @@ static void bfq_request_merged(struct request_queue *q, struct request *req, bfq_updated_next_req(bfqd, bfqq); bfq_pos_tree_add_move(bfqd, bfqq); } - spin_unlock_irq(&bfqd->lock); } } From 9136b4c953918ea937254c57cfb787b55b5bc2c6 Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Wed, 29 Mar 2017 18:55:30 +0200 Subject: [PATCH 24/51] Fix wrong unlikely Signed-off-by: Paolo Valente --- block/bfq-mq-iosched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c index bb046335ff4f..3ae9bd424b3f 100644 --- a/block/bfq-mq-iosched.c +++ b/block/bfq-mq-iosched.c @@ -4917,7 +4917,7 @@ static int bfq_get_rq_private(struct request_queue *q, struct request *rq, bfqq = bfq_get_bfqq_handle_split(bfqd, bic, bio, false, is_sync, &new_queue); - if (unlikely(!new_queue)) { + if (likely(!new_queue)) { /* If the queue was seeky for too long, break it apart. */ if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) { BUG_ON(!is_sync); From 8e05f722f19645f2278f6962368ca3b7c2a81c9c Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Fri, 12 May 2017 09:51:18 +0200 Subject: [PATCH 25/51] Change cgroup params prefix to bfq-mq for bfq-mq Signed-off-by: Paolo Valente --- block/bfq-cgroup-included.c | 54 ++++++++++++++++++++++++++------------------- 1 file changed, 31 insertions(+), 23 deletions(-) diff --git a/block/bfq-cgroup-included.c b/block/bfq-cgroup-included.c index dfacca799b5e..9e9b0a09e26f 100644 --- a/block/bfq-cgroup-included.c +++ b/block/bfq-cgroup-included.c @@ -995,9 +995,15 @@ bfq_create_group_hierarchy(struct bfq_data *bfqd, int node) return blkg_to_bfqg(bfqd->queue->root_blkg); } +#ifdef BFQ_MQ +#define BFQ_CGROUP_FNAME(param) "bfq-mq."#param +#else +#define BFQ_CGROUP_FNAME(param) "bfq."#param +#endif + static struct cftype bfq_blkcg_legacy_files[] = { { - .name = "bfq.weight", + .name = BFQ_CGROUP_FNAME(weight), .flags = CFTYPE_NOT_ON_ROOT, .seq_show = bfq_io_show_weight, .write_u64 = bfq_io_set_weight_legacy, @@ -1005,106 +1011,106 @@ static struct cftype bfq_blkcg_legacy_files[] = { /* statistics, covers only the tasks in the bfqg */ { - .name = "bfq.time", + .name = BFQ_CGROUP_FNAME(time), .private = offsetof(struct bfq_group, stats.time), .seq_show = bfqg_print_stat, }, { - .name = "bfq.sectors", + .name = BFQ_CGROUP_FNAME(sectors), .seq_show = bfqg_print_stat_sectors, }, { - .name = "bfq.io_service_bytes", + .name = BFQ_CGROUP_FNAME(io_service_bytes), .private = (unsigned long)&blkcg_policy_bfq, .seq_show = blkg_print_stat_bytes, }, { - .name = "bfq.io_serviced", + .name = BFQ_CGROUP_FNAME(io_serviced), .private = (unsigned long)&blkcg_policy_bfq, .seq_show = blkg_print_stat_ios, }, { - .name = "bfq.io_service_time", + .name = BFQ_CGROUP_FNAME(io_service_time), .private = offsetof(struct bfq_group, stats.service_time), .seq_show = bfqg_print_rwstat, }, { - .name = "bfq.io_wait_time", + .name = BFQ_CGROUP_FNAME(io_wait_time), .private = offsetof(struct bfq_group, stats.wait_time), .seq_show = bfqg_print_rwstat, }, { - .name = "bfq.io_merged", + .name = BFQ_CGROUP_FNAME(io_merged), .private = offsetof(struct bfq_group, stats.merged), .seq_show = bfqg_print_rwstat, }, { - .name = "bfq.io_queued", + .name = BFQ_CGROUP_FNAME(io_queued), .private = offsetof(struct bfq_group, stats.queued), .seq_show = bfqg_print_rwstat, }, /* the same statictics which cover the bfqg and its descendants */ { - .name = "bfq.time_recursive", + .name = BFQ_CGROUP_FNAME(time_recursive), .private = offsetof(struct bfq_group, stats.time), .seq_show = bfqg_print_stat_recursive, }, { - .name = "bfq.sectors_recursive", + .name = BFQ_CGROUP_FNAME(sectors_recursive), .seq_show = bfqg_print_stat_sectors_recursive, }, { - .name = "bfq.io_service_bytes_recursive", + .name = BFQ_CGROUP_FNAME(io_service_bytes_recursive), .private = (unsigned long)&blkcg_policy_bfq, .seq_show = blkg_print_stat_bytes_recursive, }, { - .name = "bfq.io_serviced_recursive", + .name = BFQ_CGROUP_FNAME(io_serviced_recursive), .private = (unsigned long)&blkcg_policy_bfq, .seq_show = blkg_print_stat_ios_recursive, }, { - .name = "bfq.io_service_time_recursive", + .name = BFQ_CGROUP_FNAME(io_service_time_recursive), .private = offsetof(struct bfq_group, stats.service_time), .seq_show = bfqg_print_rwstat_recursive, }, { - .name = "bfq.io_wait_time_recursive", + .name = BFQ_CGROUP_FNAME(io_wait_time_recursive), .private = offsetof(struct bfq_group, stats.wait_time), .seq_show = bfqg_print_rwstat_recursive, }, { - .name = "bfq.io_merged_recursive", + .name = BFQ_CGROUP_FNAME(io_merged_recursive), .private = offsetof(struct bfq_group, stats.merged), .seq_show = bfqg_print_rwstat_recursive, }, { - .name = "bfq.io_queued_recursive", + .name = BFQ_CGROUP_FNAME(io_queued_recursive), .private = offsetof(struct bfq_group, stats.queued), .seq_show = bfqg_print_rwstat_recursive, }, { - .name = "bfq.avg_queue_size", + .name = BFQ_CGROUP_FNAME(avg_queue_size), .seq_show = bfqg_print_avg_queue_size, }, { - .name = "bfq.group_wait_time", + .name = BFQ_CGROUP_FNAME(group_wait_time), .private = offsetof(struct bfq_group, stats.group_wait_time), .seq_show = bfqg_print_stat, }, { - .name = "bfq.idle_time", + .name = BFQ_CGROUP_FNAME(idle_time), .private = offsetof(struct bfq_group, stats.idle_time), .seq_show = bfqg_print_stat, }, { - .name = "bfq.empty_time", + .name = BFQ_CGROUP_FNAME(empty_time), .private = offsetof(struct bfq_group, stats.empty_time), .seq_show = bfqg_print_stat, }, { - .name = "bfq.dequeue", + .name = BFQ_CGROUP_FNAME(dequeue), .private = offsetof(struct bfq_group, stats.dequeue), .seq_show = bfqg_print_stat, }, @@ -1113,7 +1119,7 @@ static struct cftype bfq_blkcg_legacy_files[] = { static struct cftype bfq_blkg_files[] = { { - .name = "bfq.weight", + .name = BFQ_CGROUP_FNAME(weight), .flags = CFTYPE_NOT_ON_ROOT, .seq_show = bfq_io_show_weight, .write = bfq_io_set_weight, @@ -1121,6 +1127,8 @@ static struct cftype bfq_blkg_files[] = { {} /* terminate */ }; +#undef BFQ_CGROUP_FNAME + #else /* BFQ_GROUP_IOSCHED_ENABLED */ static inline void bfqg_stats_update_io_add(struct bfq_group *bfqg, From abdf7565dadbb00e78be5f4fb2cc9b157649840e Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Fri, 12 May 2017 11:56:13 +0200 Subject: [PATCH 26/51] Add tentative extra tests on groups, reqs and queues Signed-off-by: Paolo Valente --- block/bfq-cgroup-included.c | 1 + block/bfq-mq-iosched.c | 5 +++++ include/linux/blkdev.h | 2 ++ 3 files changed, 8 insertions(+) diff --git a/block/bfq-cgroup-included.c b/block/bfq-cgroup-included.c index 9e9b0a09e26f..72107ad12220 100644 --- a/block/bfq-cgroup-included.c +++ b/block/bfq-cgroup-included.c @@ -412,6 +412,7 @@ static void bfq_pd_init(struct blkg_policy_data *pd) BUG_ON(!blkg); bfqg = blkg_to_bfqg(blkg); bfqd = blkg->q->elevator->elevator_data; + BUG_ON(bfqg == bfqd->root_group); entity = &bfqg->entity; d = blkcg_to_bfqgd(blkg->blkcg); diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c index 3ae9bd424b3f..a9e3406fef06 100644 --- a/block/bfq-mq-iosched.c +++ b/block/bfq-mq-iosched.c @@ -4494,6 +4494,7 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, static void __bfq_insert_request(struct bfq_data *bfqd, struct request *rq) { struct bfq_queue *bfqq = RQ_BFQQ(rq), *new_bfqq; + BUG_ON(!bfqq); assert_spin_locked(&bfqd->lock); @@ -4587,6 +4588,9 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, "insert_request %p in disp: at_head %d", rq, at_head); } else { + BUG_ON(!(rq->rq_flags & RQF_GOT)); + rq->rq_flags &= ~RQF_GOT; + __bfq_insert_request(bfqd, rq); if (rq_mergeable(rq)) { @@ -4974,6 +4978,7 @@ static int bfq_get_rq_private(struct request_queue *q, struct request *rq, if (unlikely(bfq_bfqq_just_created(bfqq))) bfq_handle_burst(bfqd, bfqq); + rq->rq_flags |= RQF_GOT; spin_unlock_irq(&bfqd->lock); return 0; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 0048e59e6d07..9ae814743095 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -123,6 +123,8 @@ typedef __u32 __bitwise req_flags_t; #define RQF_SPECIAL_PAYLOAD ((__force req_flags_t)(1 << 18)) /* DEBUG: rq in bfq-mq dispatch list */ #define RQF_DISP_LIST ((__force req_flags_t)(1 << 19)) +/* DEBUG: rq had get_rq_private executed on it */ +#define RQF_GOT ((__force req_flags_t)(1 << 20)) /* flags that prevent us from merging requests: */ #define RQF_NOMERGE_FLAGS \ From 9e1c1514bc947c4e04502331372b1cc58459d8d1 Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Mon, 15 May 2017 22:25:03 +0200 Subject: [PATCH 27/51] block, bfq-mq: access and cache blkg data only when safe In blk-cgroup, operations on blkg objects are protected with the request_queue lock. This is no more the lock that protects I/O-scheduler operations in blk-mq. In fact, the latter are now protected with a finer-grained per-scheduler-instance lock. As a consequence, although blkg lookups are also rcu-protected, blk-mq I/O schedulers may see inconsistent data when they access blkg and blkg-related objects. BFQ does access these objects, and does incur this problem, in the following case. The blkg_lookup performed in bfq_get_queue, being protected (only) through rcu, may happen to return the address of a copy of the original blkg. If this is the case, then the blkg_get performed in bfq_get_queue, to pin down the blkg, is useless: it does not prevent blk-cgroup code from destroying both the original blkg and all objects directly or indirectly referred by the copy of the blkg. BFQ accesses these objects, which typically causes a crash for NULL-pointer dereference of memory-protection violation. Some additional protection mechanism should be added to blk-cgroup to address this issue. In the meantime, this commit provides a quick temporary fix for BFQ: cache (when safe) blkg data that might disappear right after a blkg_lookup. In particular, this commit exploits the following facts to achieve its goal without introducing further locks. Destroy operations on a blkg invoke, as a first step, hooks of the scheduler associated with the blkg. And these hooks are executed with bfqd->lock held for BFQ. As a consequence, for any blkg associated with the request queue an instance of BFQ is attached to, we are guaranteed that such a blkg is not destroyed, and that all the pointers it contains are consistent, while that instance is holding its bfqd->lock. A blkg_lookup performed with bfqd->lock held then returns a fully consistent blkg, which remains consistent until this lock is held. In more detail, this holds even if the returned blkg is a copy of the original one. Finally, also the object describing a group inside BFQ needs to be protected from destruction on the blkg_free of the original blkg (which invokes bfq_pd_free). This commit adds private refcounting for this object, to let it disappear only after no bfq_queue refers to it any longer. This commit also removes or updates some stale comments on locking issues related to blk-cgroup operations. Reported-by: Tomas Konir Reported-by: Lee Tibbert Reported-by: Marco Piazza Signed-off-by: Paolo Valente Tested-by: Tomas Konir Tested-by: Lee Tibbert Tested-by: Marco Piazza --- block/bfq-cgroup-included.c | 149 ++++++++++++++++++++++++++++++++++++++++---- block/bfq-mq-iosched.c | 2 +- block/bfq-mq.h | 26 +++----- 3 files changed, 148 insertions(+), 29 deletions(-) diff --git a/block/bfq-cgroup-included.c b/block/bfq-cgroup-included.c index 72107ad12220..d903393ee78a 100644 --- a/block/bfq-cgroup-included.c +++ b/block/bfq-cgroup-included.c @@ -43,7 +43,11 @@ BFQG_FLAG_FNS(idling) BFQG_FLAG_FNS(empty) #undef BFQG_FLAG_FNS +#ifdef BFQ_MQ +/* This should be called with the scheduler lock held. */ +#else /* This should be called with the queue_lock held. */ +#endif static void bfqg_stats_update_group_wait_time(struct bfqg_stats *stats) { unsigned long long now; @@ -58,7 +62,11 @@ static void bfqg_stats_update_group_wait_time(struct bfqg_stats *stats) bfqg_stats_clear_waiting(stats); } +#ifdef BFQ_MQ +/* This should be called with the scheduler lock held. */ +#else /* This should be called with the queue_lock held. */ +#endif static void bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg, struct bfq_group *curr_bfqg) { @@ -72,7 +80,11 @@ static void bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg, bfqg_stats_mark_waiting(stats); } +#ifdef BFQ_MQ +/* This should be called with the scheduler lock held. */ +#else /* This should be called with the queue_lock held. */ +#endif static void bfqg_stats_end_empty_time(struct bfqg_stats *stats) { unsigned long long now; @@ -198,14 +210,43 @@ static struct bfq_group *bfqq_group(struct bfq_queue *bfqq) static void bfqg_get(struct bfq_group *bfqg) { - return blkg_get(bfqg_to_blkg(bfqg)); +#ifdef BFQ_MQ + bfqg->ref++; +#else + blkg_get(bfqg_to_blkg(bfqg)); +#endif } static void bfqg_put(struct bfq_group *bfqg) { - return blkg_put(bfqg_to_blkg(bfqg)); +#ifdef BFQ_MQ + bfqg->ref--; + + BUG_ON(bfqg->ref < 0); + if (bfqg->ref == 0) + kfree(bfqg); +#else + blkg_put(bfqg_to_blkg(bfqg)); +#endif +} + +#ifdef BFQ_MQ +static void bfqg_and_blkg_get(struct bfq_group *bfqg) +{ + /* see comments in bfq_bic_update_cgroup for why refcounting bfqg */ + bfqg_get(bfqg); + + blkg_get(bfqg_to_blkg(bfqg)); } +static void bfqg_and_blkg_put(struct bfq_group *bfqg) +{ + bfqg_put(bfqg); + + blkg_put(bfqg_to_blkg(bfqg)); +} +#endif + static void bfqg_stats_update_io_add(struct bfq_group *bfqg, struct bfq_queue *bfqq, unsigned int op) @@ -310,7 +351,15 @@ static void bfq_init_entity(struct bfq_entity *entity, if (bfqq) { bfqq->ioprio = bfqq->new_ioprio; bfqq->ioprio_class = bfqq->new_ioprio_class; +#ifdef BFQ_MQ + /* + * Make sure that bfqg and its associated blkg do not + * disappear before entity. + */ + bfqg_and_blkg_get(bfqg); +#else bfqg_get(bfqg); +#endif } entity->parent = bfqg->my_entity; /* NULL for root group */ entity->sched_data = &bfqg->sched_data; @@ -397,6 +446,10 @@ static struct blkg_policy_data *bfq_pd_alloc(gfp_t gfp, int node) return NULL; } +#ifdef BFQ_MQ + /* see comments in bfq_bic_update_cgroup for why refcounting */ + bfqg_get(bfqg); +#endif return &bfqg->pd; } @@ -432,7 +485,11 @@ static void bfq_pd_free(struct blkg_policy_data *pd) struct bfq_group *bfqg = pd_to_bfqg(pd); bfqg_stats_exit(&bfqg->stats); - return kfree(bfqg); +#ifdef BFQ_MQ + bfqg_put(bfqg); +#else + kfree(bfqg); +#endif } static void bfq_pd_reset_stats(struct blkg_policy_data *pd) @@ -516,9 +573,16 @@ static void bfq_bfqq_expire(struct bfq_data *bfqd, * Move @bfqq to @bfqg, deactivating it from its old group and reactivating * it on the new one. Avoid putting the entity on the old group idle tree. * +#ifdef BFQ_MQ + * Must be called under the scheduler lock, to make sure that the blkg + * owning @bfqg does not disappear (see comments in + * bfq_bic_update_cgroup on guaranteeing the consistency of blkg + * objects). +#else * Must be called under the queue lock; the cgroup owning @bfqg must * not disappear (by now this just means that we are called under * rcu_read_lock()). +#endif */ static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, struct bfq_group *bfqg) @@ -555,16 +619,20 @@ static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, entity->tree); bfq_put_idle_entity(bfq_entity_service_tree(entity), entity); } +#ifdef BFQ_MQ + bfqg_and_blkg_put(bfqq_group(bfqq)); +#else bfqg_put(bfqq_group(bfqq)); +#endif - /* - * Here we use a reference to bfqg. We don't need a refcounter - * as the cgroup reference will not be dropped, so that its - * destroy() callback will not be invoked. - */ entity->parent = bfqg->my_entity; entity->sched_data = &bfqg->sched_data; +#ifdef BFQ_MQ + /* pin down bfqg and its associated blkg */ + bfqg_and_blkg_get(bfqg); +#else bfqg_get(bfqg); +#endif BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_bfqq_busy(bfqq)); if (bfq_bfqq_busy(bfqq)) { @@ -585,8 +653,14 @@ static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, * @bic: the bic to move. * @blkcg: the blk-cgroup to move to. * +#ifdef BFQ_MQ + * Move bic to blkcg, assuming that bfqd->lock is held; which makes + * sure that the reference to cgroup is valid across the call (see + * comments in bfq_bic_update_cgroup on this issue) +#else * Move bic to blkcg, assuming that bfqd->queue is locked; the caller * has to make sure that the reference to cgroup is valid across the call. +#endif * * NOTE: an alternative approach might have been to store the current * cgroup in bfqq and getting a reference to it, reducing the lookup @@ -645,6 +719,59 @@ static void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio) goto out; bfqg = __bfq_bic_change_cgroup(bfqd, bic, bio_blkcg(bio)); +#ifdef BFQ_MQ + /* + * Update blkg_path for bfq_log_* functions. We cache this + * path, and update it here, for the following + * reasons. Operations on blkg objects in blk-cgroup are + * protected with the request_queue lock, and not with the + * lock that protects the instances of this scheduler + * (bfqd->lock). This exposes BFQ to the following sort of + * race. + * + * The blkg_lookup performed in bfq_get_queue, protected + * through rcu, may happen to return the address of a copy of + * the original blkg. If this is the case, then the + * bfqg_and_blkg_get performed in bfq_get_queue, to pin down + * the blkg, is useless: it does not prevent blk-cgroup code + * from destroying both the original blkg and all objects + * directly or indirectly referred by the copy of the + * blkg. + * + * On the bright side, destroy operations on a blkg invoke, as + * a first step, hooks of the scheduler associated with the + * blkg. And these hooks are executed with bfqd->lock held for + * BFQ. As a consequence, for any blkg associated with the + * request queue this instance of the scheduler is attached + * to, we are guaranteed that such a blkg is not destroyed, and + * that all the pointers it contains are consistent, while we + * are holding bfqd->lock. A blkg_lookup performed with + * bfqd->lock held then returns a fully consistent blkg, which + * remains consistent until this lock is held. + * + * Thanks to the last fact, and to the fact that: (1) bfqg has + * been obtained through a blkg_lookup in the above + * assignment, and (2) bfqd->lock is being held, here we can + * safely use the policy data for the involved blkg (i.e., the + * field bfqg->pd) to get to the blkg associated with bfqg, + * and then we can safely use any field of blkg. After we + * release bfqd->lock, even just getting blkg through this + * bfqg may cause dangling references to be traversed, as + * bfqg->pd may not exist any more. + * + * In view of the above facts, here we cache, in the bfqg, any + * blkg data we may need for this bic, and for its associated + * bfq_queue. As of now, we need to cache only the path of the + * blkg, which is used in the bfq_log_* functions. + * + * Finally, note that bfqg itself needs to be protected from + * destruction on the blkg_free of the original blkg (which + * invokes bfq_pd_free). We use an additional private + * refcounter for bfqg, to let it disappear only after no + * bfq_queue refers to it any longer. + */ + blkg_path(bfqg_to_blkg(bfqg), bfqg->blkg_path, sizeof(bfqg->blkg_path)); +#endif bic->blkcg_serial_nr = serial_nr; out: rcu_read_unlock(); @@ -682,8 +809,6 @@ static void bfq_reparent_leaf_entity(struct bfq_data *bfqd, * @bfqd: the device data structure with the root group. * @bfqg: the group to move from. * @st: the service tree with the entities. - * - * Needs queue_lock to be taken and reference to be valid over the call. */ static void bfq_reparent_active_entities(struct bfq_data *bfqd, struct bfq_group *bfqg, @@ -736,6 +861,7 @@ static void bfq_pd_offline(struct blkg_policy_data *pd) #ifdef BFQ_MQ spin_lock_irqsave(&bfqd->lock, flags); #endif + /* * Empty all service_trees belonging to this group before * deactivating the group itself. @@ -746,8 +872,7 @@ static void bfq_pd_offline(struct blkg_policy_data *pd) /* * The idle tree may still contain bfq_queues belonging * to exited task because they never migrated to a different - * cgroup from the one being destroyed now. No one else - * can access them so it's safe to act without any lock. + * cgroup from the one being destroyed now. */ bfq_flush_idle_tree(st); diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c index a9e3406fef06..4eb668eeacdc 100644 --- a/block/bfq-mq-iosched.c +++ b/block/bfq-mq-iosched.c @@ -4073,7 +4073,7 @@ static void bfq_put_queue(struct bfq_queue *bfqq) kmem_cache_free(bfq_pool, bfqq); #ifdef BFQ_GROUP_IOSCHED_ENABLED - bfqg_put(bfqg); + bfqg_and_blkg_put(bfqg); #endif } diff --git a/block/bfq-mq.h b/block/bfq-mq.h index 36ee24a87dda..77ab0f22ed22 100644 --- a/block/bfq-mq.h +++ b/block/bfq-mq.h @@ -695,23 +695,17 @@ static struct bfq_group *bfqq_group(struct bfq_queue *bfqq); static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); #define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \ - char __pbuf[128]; \ - \ - blkg_path(bfqg_to_blkg(bfqq_group(bfqq)), __pbuf, sizeof(__pbuf)); \ pr_crit("%s bfq%d%c %s " fmt "\n", \ checked_dev_name((bfqd)->queue->backing_dev_info->dev), \ (bfqq)->pid, \ bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ - __pbuf, ##args); \ + bfqq_group(bfqq)->blkg_path, ##args); \ } while (0) #define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do { \ - char __pbuf[128]; \ - \ - blkg_path(bfqg_to_blkg(bfqg), __pbuf, sizeof(__pbuf)); \ pr_crit("%s %s " fmt "\n", \ checked_dev_name((bfqd)->queue->backing_dev_info->dev), \ - __pbuf, ##args); \ + bfqg->blkg_path, ##args); \ } while (0) #else /* BFQ_GROUP_IOSCHED_ENABLED */ @@ -736,20 +730,14 @@ static struct bfq_group *bfqq_group(struct bfq_queue *bfqq); static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); #define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \ - char __pbuf[128]; \ - \ - blkg_path(bfqg_to_blkg(bfqq_group(bfqq)), __pbuf, sizeof(__pbuf)); \ blk_add_trace_msg((bfqd)->queue, "bfq%d%c %s " fmt, \ (bfqq)->pid, \ bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ - __pbuf, ##args); \ + bfqq_group(bfqq)->blkg_path, ##args); \ } while (0) #define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do { \ - char __pbuf[128]; \ - \ - blkg_path(bfqg_to_blkg(bfqg), __pbuf, sizeof(__pbuf)); \ - blk_add_trace_msg((bfqd)->queue, "%s " fmt, __pbuf, ##args); \ + blk_add_trace_msg((bfqd)->queue, "%s " fmt, bfqg->blkg_path, ##args);\ } while (0) #else /* BFQ_GROUP_IOSCHED_ENABLED */ @@ -860,6 +848,12 @@ struct bfq_group { /* must be the first member */ struct blkg_policy_data pd; + /* cached path for this blkg (see comments in bfq_bic_update_cgroup) */ + char blkg_path[128]; + + /* reference counter (see comments in bfq_bic_update_cgroup) */ + int ref; + struct bfq_entity entity; struct bfq_sched_data sched_data; From c9137b749aceef6c2dde88e99b2fc978d5952e76 Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Sat, 17 Jun 2017 11:18:11 +0200 Subject: [PATCH 28/51] bfq-mq: fix macro name in conditional invocation of policy_unregister This commit fixes the name of the macro in the conditional group that invokes blkcg_policy_unregister in bfq_exit for bfq-mq. Because of this error, blkcg_policy_unregister was never invoked. Signed-off-by: Paolo Valente --- block/bfq-mq-iosched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c index 4eb668eeacdc..bc1de3f70ea8 100644 --- a/block/bfq-mq-iosched.c +++ b/block/bfq-mq-iosched.c @@ -5669,7 +5669,7 @@ static int __init bfq_init(void) static void __exit bfq_exit(void) { elv_unregister(&iosched_bfq_mq); -#ifdef CONFIG_BFQ_GROUP_ENABLED +#ifdef BFQ_GROUP_IOSCHED_ENABLED blkcg_policy_unregister(&blkcg_policy_bfq); #endif bfq_slab_kill(); From c7ceb37496f63b2dba4d06946ab85ec97b87bfb5 Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Wed, 5 Jul 2017 11:48:17 +0200 Subject: [PATCH 29/51] Port of "blk-mq-sched: unify request finished methods" No need to have two different callouts of bfq vs kyber. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/bfq-mq-iosched.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c index bc1de3f70ea8..2598602a0b10 100644 --- a/block/bfq-mq-iosched.c +++ b/block/bfq-mq-iosched.c @@ -4753,7 +4753,7 @@ static void bfq_put_rq_priv_body(struct bfq_queue *bfqq) bfq_put_queue(bfqq); } -static void bfq_put_rq_private(struct request_queue *q, struct request *rq) +static void bfq_finish_request(struct request *rq) { struct bfq_queue *bfqq; struct bfq_data *bfqd; @@ -4814,7 +4814,7 @@ static void bfq_put_rq_private(struct request_queue *q, struct request *rq) assert_spin_locked(&bfqd->lock); if (!RB_EMPTY_NODE(&rq->rb_node)) - bfq_remove_request(q, rq); + bfq_remove_request(rq->q, rq); bfq_put_rq_priv_body(bfqq); } @@ -5558,7 +5558,7 @@ static struct elv_fs_entry bfq_attrs[] = { static struct elevator_type iosched_bfq_mq = { .ops.mq = { .get_rq_priv = bfq_get_rq_private, - .put_rq_priv = bfq_put_rq_private, + .finish_request = bfq_finish_request, .exit_icq = bfq_exit_icq, .insert_requests = bfq_insert_requests, .dispatch_request = bfq_dispatch_request, From 12bef026fe114ab5e2e284772ddc52a8be83fdbc Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Wed, 5 Jul 2017 11:54:57 +0200 Subject: [PATCH 30/51] Port of "bfq-iosched: fix NULL ioc check in bfq_get_rq_private" icq_to_bic is a container_of operation, so we need to check for NULL before it. Also move the check outside the spinlock while we're at it. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/bfq-mq-iosched.c | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c index 2598602a0b10..c57774a60911 100644 --- a/block/bfq-mq-iosched.c +++ b/block/bfq-mq-iosched.c @@ -4903,16 +4903,17 @@ static int bfq_get_rq_private(struct request_queue *q, struct request *rq, struct bio *bio) { struct bfq_data *bfqd = q->elevator->elevator_data; - struct bfq_io_cq *bic = icq_to_bic(rq->elv.icq); + struct bfq_io_cq *bic; const int is_sync = rq_is_sync(rq); struct bfq_queue *bfqq; bool bfqq_already_existing = false, split = false; bool new_queue = false; - spin_lock_irq(&bfqd->lock); + if (!rq->elv.icq) + return 1; + bic = icq_to_bic(rq->elv.icq); - if (!bic) - goto queue_fail; + spin_lock_irq(&bfqd->lock); bfq_check_ioprio_change(bic, bio); @@ -4980,13 +4981,7 @@ static int bfq_get_rq_private(struct request_queue *q, struct request *rq, rq->rq_flags |= RQF_GOT; spin_unlock_irq(&bfqd->lock); - return 0; - -queue_fail: - spin_unlock_irq(&bfqd->lock); - - return 1; } static void bfq_idle_slice_timer_body(struct bfq_queue *bfqq) From 633e5711347df1bf4ca935fd0aa9118a0054f75d Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Wed, 5 Jul 2017 12:02:16 +0200 Subject: [PATCH 31/51] Port of "blk-mq-sched: unify request prepare methods" This patch makes sure we always allocate requests in the core blk-mq code and use a common prepare_request method to initialize them for both mq I/O schedulers. For Kyber and additional limit_depth method is added that is called before allocating the request. Also because none of the intializations can really fail the new method does not return an error - instead the bfq finish method is hardened to deal with the no-IOC case. Last but not least this removes the abuse of RQF_QUEUE by the blk-mq scheduling code as RQF_ELFPRIV is all that is needed now. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/bfq-mq-iosched.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c index c57774a60911..49ffca1ad6e7 100644 --- a/block/bfq-mq-iosched.c +++ b/block/bfq-mq-iosched.c @@ -4760,6 +4760,10 @@ static void bfq_finish_request(struct request *rq) struct bfq_io_cq *bic; BUG_ON(!rq); + + if (!rq->elv.icq) + return; + bfqq = RQ_BFQQ(rq); BUG_ON(!bfqq); @@ -4899,9 +4903,9 @@ static struct bfq_queue *bfq_get_bfqq_handle_split(struct bfq_data *bfqd, /* * Allocate bfq data structures associated with this request. */ -static int bfq_get_rq_private(struct request_queue *q, struct request *rq, - struct bio *bio) +static void bfq_prepare_request(struct request *rq, struct bio *bio) { + struct request_queue *q = rq->q; struct bfq_data *bfqd = q->elevator->elevator_data; struct bfq_io_cq *bic; const int is_sync = rq_is_sync(rq); @@ -4910,7 +4914,7 @@ static int bfq_get_rq_private(struct request_queue *q, struct request *rq, bool new_queue = false; if (!rq->elv.icq) - return 1; + return; bic = icq_to_bic(rq->elv.icq); spin_lock_irq(&bfqd->lock); @@ -4981,7 +4985,6 @@ static int bfq_get_rq_private(struct request_queue *q, struct request *rq, rq->rq_flags |= RQF_GOT; spin_unlock_irq(&bfqd->lock); - return 0; } static void bfq_idle_slice_timer_body(struct bfq_queue *bfqq) @@ -5552,7 +5555,7 @@ static struct elv_fs_entry bfq_attrs[] = { static struct elevator_type iosched_bfq_mq = { .ops.mq = { - .get_rq_priv = bfq_get_rq_private, + .prepare_request = bfq_prepare_request, .finish_request = bfq_finish_request, .exit_icq = bfq_exit_icq, .insert_requests = bfq_insert_requests, From 5a321acfce282c3e58ac63582faf6f928ad17f27 Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Wed, 5 Jul 2017 12:43:22 +0200 Subject: [PATCH 32/51] Add list of bfq instances to documentation Signed-off-by: Paolo Valente --- Documentation/block/bfq-iosched.txt | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/Documentation/block/bfq-iosched.txt b/Documentation/block/bfq-iosched.txt index 3d6951d63489..8ce6b9a9bacd 100644 --- a/Documentation/block/bfq-iosched.txt +++ b/Documentation/block/bfq-iosched.txt @@ -11,6 +11,15 @@ controllers), BFQ's main features are: groups (switching back to time distribution when needed to keep throughput high). +If bfq-mq patches have been applied, then the following three +instances of BFQ are available (otherwise only the first instance): +- bfq: mainline version of BFQ, for blk-mq +- bfq-mq: development version of BFQ for blk-mq; this version contains + also all latest features not yet landed in mainline, plus many + safety checks +- bfq: BFQ for legacy blk; also this version contains both latest + features and safety checks + In its default configuration, BFQ privileges latency over throughput. So, when needed for achieving a lower latency, BFQ builds schedules that may lead to a lower throughput. If your main or only @@ -27,7 +36,7 @@ sequential I/O (e.g., 8-12 GB/s if I/O requests are 256 KB large), and to 120-200 MB/s with 4KB random I/O. BFQ is currently being tested on multi-queue devices too. -The table of contents follow. Impatients can just jump to Section 3. +The table of contents follows. Impatients can just jump to Section 3. CONTENTS From 9f2e5b27227fd9254cc258572dc2d4531838c30b Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Wed, 5 Jul 2017 16:28:00 +0200 Subject: [PATCH 33/51] bfq-sq: fix prefix of names of cgroups parameters Signed-off-by: Paolo Valente --- Documentation/block/bfq-iosched.txt | 12 +++++++----- block/bfq-cgroup-included.c | 2 +- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/Documentation/block/bfq-iosched.txt b/Documentation/block/bfq-iosched.txt index 8ce6b9a9bacd..965d82f94db9 100644 --- a/Documentation/block/bfq-iosched.txt +++ b/Documentation/block/bfq-iosched.txt @@ -503,10 +503,12 @@ To get proportional sharing of bandwidth with BFQ for a given device, BFQ must of course be the active scheduler for that device. Within each group directory, the names of the files associated with -BFQ-specific cgroup parameters and stats begin with the "bfq." -prefix. So, with cgroups-v1 or cgroups-v2, the full prefix for -BFQ-specific files is "blkio.bfq." or "io.bfq." For example, the group -parameter to set the weight of a group with BFQ is blkio.bfq.weight +BFQ-specific cgroup parameters and stats begin with the "bfq.", +"bfq-sq." or "bfq-mq." prefix, depending on which instance of bfq you +want to use. So, with cgroups-v1 or cgroups-v2, the full prefix for +BFQ-specific files is "blkio.bfqX." or "io.bfqX.", where X can be "" +(i.e., null string), "-sq" or "-mq". For example, the group parameter +to set the weight of a group with the mainline BFQ is blkio.bfq.weight or io.bfq.weight. Parameters to set @@ -514,7 +516,7 @@ Parameters to set For each group, there is only the following parameter to set. -weight (namely blkio.bfq.weight or io.bfq-weight): the weight of the +weight (namely blkio.bfqX.weight or io.bfqX.weight): the weight of the group inside its parent. Available values: 1..10000 (default 100). The linear mapping between ioprio and weights, described at the beginning of the tunable section, is still valid, but all weights higher than diff --git a/block/bfq-cgroup-included.c b/block/bfq-cgroup-included.c index d903393ee78a..631e53d9150d 100644 --- a/block/bfq-cgroup-included.c +++ b/block/bfq-cgroup-included.c @@ -1124,7 +1124,7 @@ bfq_create_group_hierarchy(struct bfq_data *bfqd, int node) #ifdef BFQ_MQ #define BFQ_CGROUP_FNAME(param) "bfq-mq."#param #else -#define BFQ_CGROUP_FNAME(param) "bfq."#param +#define BFQ_CGROUP_FNAME(param) "bfq-sq."#param #endif static struct cftype bfq_blkcg_legacy_files[] = { From 92b42df8166939ccf26aa450125b5b575cf6d505 Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Wed, 5 Jul 2017 21:08:32 +0200 Subject: [PATCH 34/51] Add to documentation that bfq-mq and bfq-sq contain last fixes too Signed-off-by: Paolo Valente --- Documentation/block/bfq-iosched.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Documentation/block/bfq-iosched.txt b/Documentation/block/bfq-iosched.txt index 965d82f94db9..0e59f1c9d30e 100644 --- a/Documentation/block/bfq-iosched.txt +++ b/Documentation/block/bfq-iosched.txt @@ -15,10 +15,10 @@ If bfq-mq patches have been applied, then the following three instances of BFQ are available (otherwise only the first instance): - bfq: mainline version of BFQ, for blk-mq - bfq-mq: development version of BFQ for blk-mq; this version contains - also all latest features not yet landed in mainline, plus many + also all latest features and fixes not yet landed in mainline, plus many safety checks -- bfq: BFQ for legacy blk; also this version contains both latest - features and safety checks +- bfq: BFQ for legacy blk; also this version contains latest features + and fixes, as well as safety checks In its default configuration, BFQ privileges latency over throughput. So, when needed for achieving a lower latency, BFQ builds From 7f9bdd433b848d4f53c167258bf4d0b3f1ae1923 Mon Sep 17 00:00:00 2001 From: Lee Tibbert Date: Wed, 19 Jul 2017 10:28:32 -0400 Subject: [PATCH 35/51] Improve most frequently used no-logging path This patch originated as a fix for compiler unused-variable warnings issued when compiling bfq-mq with logging disabled (both CONFIG_BLK_DEV_IO_TRACE and CONFIG_BFQ_REDIRECT_TO_CONSOLE undefined). It turns out to also have benefits for the bfq-sq path as well. In most performance sensitive production builds blktrace_api logging will probably be turned off, so it is worth making the no-logging path compile without warnings. Any performance benefit is a bonus. Thank you to T. B. on the bfq-iosched@googlegroups.com list for ((void) (bfqq)) simplification/suggestion/improvement. All bugs and unclear descriptions are my own doing. The discussion below is based on the gcc compiler with optimization level of at least 02. Lower optimization levels are unlikely to remove no-op instruction equivalents. Provide three improvements in this likely case. 1) Fix multiple occurrences of an unused-variable warning issued when compiling bfq-mq with no logging. The warning occurred each time the bfq_log_bfqg macro was expanded inside a code block such as the following snippet from block/bfq-sched.c, line 139 and few following, lightly edited for indentation in order to pass checkpatch.pl maximum line lengths. else { struct bfq_group *bfqg = container_of(next_in_service, struct bfq_group, entity); bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, "update_next_in_service: chosen this entity"); } Previously bfq-mq.h expanded bfq_log_bfqg to blk_add_trace_msg. When both bfq console logging and blktrace_api logging are disabled, include/linux/blktrace_api expands to do { } while (0), leaving the code block local variable unused. bfq_log_bfqq() had similar behavior but is never called with a potentially unused variable. This patch fixes that macro for consistency. bfq-sq.h (single queue) with blktrace_api enabled, and the bfq console logging macros have code paths which not trigger this warning. kernel.org (4.12 & 4.13) bfq (bfq-iosched.h) could trigger the warning but no code does so now. This patch fixes bfq-iosched.h for consistency. The style above enables a software engineering approach where complex expressions are moved to a local variable before the bfq_log* call. This makes it easier to read the expression and use breakpoints to verify it. bfq-mq uses this approach in several places. New bfq_log* macros are provided for the no-logging case. I touch only the second argument, because current code never uses the local variable approach with the first or other arguments. I tried to balance consistency with simplicity. 2) For bfq-sq, reduce to zero, the number of instructions executed when no logging is configured. No sense marshaling arguments which are never going to be used. On a trial V8R11 builds, this reduced the size of bfq-iosched.o by 14.3 KiB. The size went from 70304 to 55664 bytes. bfq-mq and kernel.org bfq code size does not change because existing macros already optimize to zero bytes when not logging. The current changes maintains consistency with the bfq-sq path and makes the bfq-mq & bfq no-logging paths resistant to future logging path macro changes which might cause generated code. 3) Slightly reduce compile time of all bfq variants by including blktrace_api.h only when it will be used. Signed-off-by: Lee Tibbert --- block/bfq-mq.h | 18 +++++++++++++++++- block/bfq.h | 18 +++++++++++++++++- 2 files changed, 34 insertions(+), 2 deletions(-) diff --git a/block/bfq-mq.h b/block/bfq-mq.h index 77ab0f22ed22..7ed2cc29be57 100644 --- a/block/bfq-mq.h +++ b/block/bfq-mq.h @@ -15,7 +15,6 @@ #ifndef _BFQ_H #define _BFQ_H -#include #include #include @@ -725,6 +724,21 @@ static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); ##args) #else /* CONFIG_BFQ_REDIRECT_TO_CONSOLE */ + +#if !defined(CONFIG_BLK_DEV_IO_TRACE) + +/* Avoid possible "unused-variable" warning. See commit message. */ + +#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) ((void) (bfqq)) + +#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) ((void) (bfqg)) + +#define bfq_log(bfqd, fmt, args...) do {} while (0) + +#else /* CONFIG_BLK_DEV_IO_TRACE */ + +#include + #ifdef BFQ_GROUP_IOSCHED_ENABLED static struct bfq_group *bfqq_group(struct bfq_queue *bfqq); static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); @@ -752,6 +766,8 @@ static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); #define bfq_log(bfqd, fmt, args...) \ blk_add_trace_msg((bfqd)->queue, "bfq " fmt, ##args) + +#endif /* CONFIG_BLK_DEV_IO_TRACE */ #endif /* CONFIG_BFQ_REDIRECT_TO_CONSOLE */ /* Expiration reasons. */ diff --git a/block/bfq.h b/block/bfq.h index 53954d1b87f8..15d326f466b7 100644 --- a/block/bfq.h +++ b/block/bfq.h @@ -15,7 +15,6 @@ #ifndef _BFQ_H #define _BFQ_H -#include #include #include @@ -725,6 +724,21 @@ static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); ##args) #else /* CONFIG_BFQ_REDIRECT_TO_CONSOLE */ + +#if !defined(CONFIG_BLK_DEV_IO_TRACE) + +/* Avoid possible "unused-variable" warning. See commit message. */ + +#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) ((void) (bfqq)) + +#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) ((void) (bfqg)) + +#define bfq_log(bfqd, fmt, args...) do {} while (0) + +#else /* CONFIG_BLK_DEV_IO_TRACE */ + +#include + #ifdef BFQ_GROUP_IOSCHED_ENABLED static struct bfq_group *bfqq_group(struct bfq_queue *bfqq); static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); @@ -759,6 +773,8 @@ static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); #define bfq_log(bfqd, fmt, args...) \ blk_add_trace_msg((bfqd)->queue, "bfq " fmt, ##args) + +#endif /* CONFIG_BLK_DEV_IO_TRACE */ #endif /* CONFIG_BFQ_REDIRECT_TO_CONSOLE */ /* Expiration reasons. */ From f11a0e751e741bf94c6a48234824d50b3c0100ad Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Wed, 9 Aug 2017 16:40:39 +0200 Subject: [PATCH 36/51] bfq-sq: fix commit "Remove all get and put of I/O contexts" in branch bfq-mq The commit "Remove all get and put of I/O contexts" erroneously removed the reset of the field in_service_bic for bfq-sq. This commit re-adds that missing reset. Signed-off-by: Paolo Valente --- block/bfq-sched.c | 7 +++++++ block/bfq-sq-iosched.c | 1 + 2 files changed, 8 insertions(+) diff --git a/block/bfq-sched.c b/block/bfq-sched.c index 9c4e6797d8c9..7425824c26b8 100644 --- a/block/bfq-sched.c +++ b/block/bfq-sched.c @@ -1904,6 +1904,13 @@ static void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd) struct bfq_entity *in_serv_entity = &in_serv_bfqq->entity; struct bfq_entity *entity = in_serv_entity; +#ifndef BFQ_MQ + if (bfqd->in_service_bic) { + put_io_context(bfqd->in_service_bic->icq.ioc); + bfqd->in_service_bic = NULL; + } +#endif + bfq_clear_bfqq_wait_request(in_serv_bfqq); hrtimer_try_to_cancel(&bfqd->idle_slice_timer); bfqd->in_service_queue = NULL; diff --git a/block/bfq-sq-iosched.c b/block/bfq-sq-iosched.c index 25da0d1c0622..e1960bf149d8 100644 --- a/block/bfq-sq-iosched.c +++ b/block/bfq-sq-iosched.c @@ -3765,6 +3765,7 @@ static int bfq_dispatch_request(struct bfq_data *bfqd, if (!bfqd->in_service_bic) { atomic_long_inc(&RQ_BIC(rq)->icq.ioc->refcount); bfqd->in_service_bic = RQ_BIC(rq); + BUG_ON(!bfqd->in_service_bic); } if (bfqd->busy_queues > 1 && bfq_class_idle(bfqq)) From eceae5457530df8598557767d7be258ca9384de4 Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Wed, 9 Aug 2017 22:29:01 +0200 Subject: [PATCH 37/51] bfq-sq-mq: make lookup_next_entity push up vtime on expirations To provide a very smooth service, bfq starts to serve a bfq_queue only if the queue is 'eligible', i.e., if the same queue would have started to be served in the ideal, perfectly fair system that bfq simulates internally. This is obtained by associating each queue with a virtual start time, and by computing a special system virtual time quantity: a queue is eligible only if the system virtual time has reached the virtual start time of the queue. Finally, bfq guarantees that, when a new queue must be set in service, there is always at least one eligible entity for each active parent entity in the scheduler. To provide this guarantee, the function __bfq_lookup_next_entity pushes up, for each parent entity on which it is invoked, the system virtual time to the minimum among the virtual start times of the entities in the active tree for the parent entity (more precisely, the push up occurs if the system virtual time happens to be lower than all such virtual start times). There is however a circumstance in which __bfq_lookup_next_entity cannot push up the system virtual time for a parent entity, even if the system virtual time is lower than the virtual start times of all the child entities in the active tree. It happens if one of the child entities is in service. In fact, in such a case, there is already an eligible entity, the in-service one, even if it may not be not present in the active tree (because in-service entities may be removed from the active tree). Unfortunately, in the last re-design of the hierarchical-scheduling engine, the reset of the pointer to the in-service entity for a given parent entity--reset to be done as a consequence of the expiration of the in-service entity--always happens after the function __bfq_lookup_next_entity has been invoked. This causes the function to think that there is still an entity in service for the parent entity, and then that the system virtual time cannot be pushed up, even if actually such a no-more-in-service entity has already been properly reinserted into the active tree (or in some other tree if no more active). Yet, the system virtual time *had* to be pushed up, to be ready to correctly choose the next queue to serve. Because of the lack of this push up, bfq may wrongly set in service a queue that had been speculatively pre-computed as the possible next-in-service queue, but that would no more be the one to serve after the expiration and the reinsertion into the active trees of the previously in-service entities. This commit addresses this issue by making __bfq_lookup_next_entity properly push up the system virtual time if an expiration is occurring. Signed-off-by: Paolo Valente --- block/bfq-mq-iosched.c | 4 +-- block/bfq-sched.c | 77 ++++++++++++++++++++++++++++++++------------------ block/bfq-sq-iosched.c | 4 +-- 3 files changed, 53 insertions(+), 32 deletions(-) diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c index 49ffca1ad6e7..b5c848650375 100644 --- a/block/bfq-mq-iosched.c +++ b/block/bfq-mq-iosched.c @@ -682,7 +682,7 @@ static void bfq_updated_next_req(struct bfq_data *bfqd, entity->budget = new_budget; bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu", new_budget); - bfq_requeue_bfqq(bfqd, bfqq); + bfq_requeue_bfqq(bfqd, bfqq, false); } } @@ -2822,7 +2822,7 @@ static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq) bfq_del_bfqq_busy(bfqd, bfqq, true); } else { - bfq_requeue_bfqq(bfqd, bfqq); + bfq_requeue_bfqq(bfqd, bfqq, true); /* * Resort priority tree of potential close cooperators. */ diff --git a/block/bfq-sched.c b/block/bfq-sched.c index 7425824c26b8..f3001af37256 100644 --- a/block/bfq-sched.c +++ b/block/bfq-sched.c @@ -33,7 +33,8 @@ static struct bfq_entity *bfq_root_active_entity(struct rb_root *tree) return rb_entry(node, struct bfq_entity, rb_node); } -static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd); +static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd, + bool expiration); static bool bfq_update_parent_budget(struct bfq_entity *next_in_service); @@ -43,6 +44,8 @@ static bool bfq_update_parent_budget(struct bfq_entity *next_in_service); * @new_entity: if not NULL, pointer to the entity whose activation, * requeueing or repositionig triggered the invocation of * this function. + * @expiration: id true, this function is being invoked after the + * expiration of the in-service entity * * This function is called to update sd->next_in_service, which, in * its turn, may change as a consequence of the insertion or @@ -61,7 +64,8 @@ static bool bfq_update_parent_budget(struct bfq_entity *next_in_service); * entity. */ static bool bfq_update_next_in_service(struct bfq_sched_data *sd, - struct bfq_entity *new_entity) + struct bfq_entity *new_entity, + bool expiration) { struct bfq_entity *next_in_service = sd->next_in_service; struct bfq_queue *bfqq; @@ -120,7 +124,7 @@ static bool bfq_update_next_in_service(struct bfq_sched_data *sd, if (replace_next) next_in_service = new_entity; } else /* invoked because of a deactivation: lookup needed */ - next_in_service = bfq_lookup_next_entity(sd); + next_in_service = bfq_lookup_next_entity(sd, expiration); if (next_in_service) { parent_sched_may_change = !sd->next_in_service || @@ -1291,10 +1295,12 @@ static void __bfq_activate_requeue_entity(struct bfq_entity *entity, * @requeue: true if this is a requeue, which implies that bfqq is * being expired; thus ALL its ancestors stop being served and must * therefore be requeued + * @expiration: true if this function is being invoked in the expiration path + * of the in-service queue */ static void bfq_activate_requeue_entity(struct bfq_entity *entity, bool non_blocking_wait_rq, - bool requeue) + bool requeue, bool expiration) { struct bfq_sched_data *sd; @@ -1307,7 +1313,8 @@ static void bfq_activate_requeue_entity(struct bfq_entity *entity, RB_EMPTY_ROOT(&(sd->service_tree+1)->active) && RB_EMPTY_ROOT(&(sd->service_tree+2)->active)); - if (!bfq_update_next_in_service(sd, entity) && !requeue) { + if (!bfq_update_next_in_service(sd, entity, expiration) && + !requeue) { BUG_ON(!sd->next_in_service); break; } @@ -1373,6 +1380,8 @@ static bool __bfq_deactivate_entity(struct bfq_entity *entity, * bfq_deactivate_entity - deactivate an entity representing a bfq_queue. * @entity: the entity to deactivate. * @ins_into_idle_tree: true if the entity can be put into the idle tree + * @expiration: true if this function is being invoked in the expiration path + * of the in-service queue */ static void bfq_deactivate_entity(struct bfq_entity *entity, bool ins_into_idle_tree, @@ -1417,7 +1426,7 @@ static void bfq_deactivate_entity(struct bfq_entity *entity, * then, since entity has just been * deactivated, a new one must be found. */ - bfq_update_next_in_service(sd, NULL); + bfq_update_next_in_service(sd, NULL, expiration); if (sd->next_in_service || sd->in_service_entity) { /* @@ -1495,7 +1504,7 @@ static void bfq_deactivate_entity(struct bfq_entity *entity, "invoking udpdate_next for this entity"); } #endif - if (!bfq_update_next_in_service(sd, entity) && + if (!bfq_update_next_in_service(sd, entity, expiration) && !expiration) /* * next_in_service unchanged or not causing @@ -1524,7 +1533,7 @@ static u64 bfq_calc_vtime_jump(struct bfq_service_tree *st) if (bfqq) bfq_log_bfqq(bfqq->bfqd, bfqq, "calc_vtime_jump: new value %llu", - root_entity->min_start); + ((root_entity->min_start>>10)*1000)>>12); #ifdef BFQ_GROUP_IOSCHED_ENABLED else { struct bfq_group *bfqg = @@ -1533,7 +1542,7 @@ static u64 bfq_calc_vtime_jump(struct bfq_service_tree *st) bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, "calc_vtime_jump: new value %llu", - root_entity->min_start); + ((root_entity->min_start>>10)*1000)>>12); } #endif return root_entity->min_start; @@ -1615,17 +1624,9 @@ static struct bfq_entity *bfq_first_active_entity(struct bfq_service_tree *st, * 3) is idle. */ static struct bfq_entity * -__bfq_lookup_next_entity(struct bfq_service_tree *st, bool in_service -#if 0 - , bool force -#endif - ) +__bfq_lookup_next_entity(struct bfq_service_tree *st, bool in_service) { - struct bfq_entity *entity -#if 0 - , *new_next_in_service = NULL -#endif - ; + struct bfq_entity *entity; u64 new_vtime; struct bfq_queue *bfqq; @@ -1667,8 +1668,9 @@ __bfq_lookup_next_entity(struct bfq_service_tree *st, bool in_service container_of(entity, struct bfq_group, entity); bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, - "__lookup_next: start %llu vtime %llu st %p", + "__lookup_next: start %llu vtime %llu (%llu) st %p", ((entity->start>>10)*1000)>>12, + ((st->vtime>>10)*1000)>>12, ((new_vtime>>10)*1000)>>12, st); } #endif @@ -1681,12 +1683,14 @@ __bfq_lookup_next_entity(struct bfq_service_tree *st, bool in_service /** * bfq_lookup_next_entity - return the first eligible entity in @sd. * @sd: the sched_data. + * @expiration: true if we are on the expiration path of the in-service queue * * This function is invoked when there has been a change in the trees - * for sd, and we need know what is the new next entity after this - * change. + * for sd, and we need to know what is the new next entity to serve + * after this change. */ -static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd) +static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd, + bool expiration) { struct bfq_service_tree *st = sd->service_tree; struct bfq_service_tree *idle_class_st = st + (BFQ_IOPRIO_CLASSES - 1); @@ -1716,8 +1720,24 @@ static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd) * class, unless the idle class needs to be served. */ for (; class_idx < BFQ_IOPRIO_CLASSES; class_idx++) { + /* + * If expiration is true, then bfq_lookup_next_entity + * is being invoked as a part of the expiration path + * of the in-service queue. In this case, even if + * sd->in_service_entity is not NULL, + * sd->in_service_entiy at this point is actually not + * in service any more, and, if needed, has already + * been properly queued or requeued into the right + * tree. The reason why sd->in_service_entity is still + * not NULL here, even if expiration is true, is that + * sd->in_service_entiy is reset as a last step in the + * expiration path. So, if expiration is true, tell + * __bfq_lookup_next_entity that there is no + * sd->in_service_entity. + */ entity = __bfq_lookup_next_entity(st + class_idx, - sd->in_service_entity); + sd->in_service_entity && + !expiration); if (entity) break; @@ -1891,7 +1911,7 @@ static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd) for_each_entity(entity) { struct bfq_sched_data *sd = entity->sched_data; - if(!bfq_update_next_in_service(sd, NULL)) + if (!bfq_update_next_in_service(sd, NULL, false)) break; } @@ -1951,16 +1971,17 @@ static void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) entity->on_st); bfq_activate_requeue_entity(entity, bfq_bfqq_non_blocking_wait_rq(bfqq), - false); + false, false); bfq_clear_bfqq_non_blocking_wait_rq(bfqq); } -static void bfq_requeue_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) +static void bfq_requeue_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, + bool expiration) { struct bfq_entity *entity = &bfqq->entity; bfq_activate_requeue_entity(entity, false, - bfqq == bfqd->in_service_queue); + bfqq == bfqd->in_service_queue, expiration); } static void bfqg_stats_update_dequeue(struct bfq_group *bfqg); diff --git a/block/bfq-sq-iosched.c b/block/bfq-sq-iosched.c index e1960bf149d8..42393ab889a9 100644 --- a/block/bfq-sq-iosched.c +++ b/block/bfq-sq-iosched.c @@ -644,7 +644,7 @@ static void bfq_updated_next_req(struct bfq_data *bfqd, entity->budget = new_budget; bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu", new_budget); - bfq_requeue_bfqq(bfqd, bfqq); + bfq_requeue_bfqq(bfqd, bfqq, false); } } @@ -2715,7 +2715,7 @@ static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq) bfq_del_bfqq_busy(bfqd, bfqq, true); } else { - bfq_requeue_bfqq(bfqd, bfqq); + bfq_requeue_bfqq(bfqd, bfqq, true); /* * Resort priority tree of potential close cooperators. */ From ee9f95b24e1d88ffba4845981c2a4684aefd0245 Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Wed, 9 Aug 2017 22:53:00 +0200 Subject: [PATCH 38/51] bfq-sq-mq: remove direct switch to an entity in higher class If the function bfq_update_next_in_service is invoked as a consequence of the activation or requeueing of an entity, say E, and finds out that E belongs to a higher-priority class than that of the current next-in-service entity, then it sets next_in_service directly to E. But this may lead to anomalous schedules, because E may happen not be eligible for service, because its virtual start time is higher than the system virtual time for its service tree. This commit addresses this issue by simply removing this direct switch. Signed-off-by: Paolo Valente --- block/bfq-sched.c | 19 +++++-------------- 1 file changed, 5 insertions(+), 14 deletions(-) diff --git a/block/bfq-sched.c b/block/bfq-sched.c index f3001af37256..b1a59088db88 100644 --- a/block/bfq-sched.c +++ b/block/bfq-sched.c @@ -76,9 +76,8 @@ static bool bfq_update_next_in_service(struct bfq_sched_data *sd, * or repositiong of an entity that does not coincide with * sd->next_in_service, then a full lookup in the active tree * can be avoided. In fact, it is enough to check whether the - * just-modified entity has a higher priority than - * sd->next_in_service, or, even if it has the same priority - * as sd->next_in_service, is eligible and has a lower virtual + * just-modified entity has the same priority as + * sd->next_in_service, is eligible and has a lower virtual * finish time than sd->next_in_service. If this compound * condition holds, then the new entity becomes the new * next_in_service. Otherwise no change is needed. @@ -94,9 +93,8 @@ static bool bfq_update_next_in_service(struct bfq_sched_data *sd, /* * If there is already a next_in_service candidate - * entity, then compare class priorities or timestamps - * to decide whether to replace sd->service_tree with - * new_entity. + * entity, then compare timestamps to decide whether + * to replace sd->service_tree with new_entity. */ if (next_in_service) { unsigned int new_entity_class_idx = @@ -104,10 +102,6 @@ static bool bfq_update_next_in_service(struct bfq_sched_data *sd, struct bfq_service_tree *st = sd->service_tree + new_entity_class_idx; - /* - * For efficiency, evaluate the most likely - * sub-condition first. - */ replace_next = (new_entity_class_idx == bfq_class_idx(next_in_service) @@ -115,10 +109,7 @@ static bool bfq_update_next_in_service(struct bfq_sched_data *sd, !bfq_gt(new_entity->start, st->vtime) && bfq_gt(next_in_service->finish, - new_entity->finish)) - || - new_entity_class_idx < - bfq_class_idx(next_in_service); + new_entity->finish)); } if (replace_next) From a3fdc5af40537355b68c1f0d3997c5a5fb54b9ce Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Thu, 10 Aug 2017 08:15:50 +0200 Subject: [PATCH 39/51] bfq-sq-mq: guarantee update_next_in_service always returns an eligible entity If the function bfq_update_next_in_service is invoked as a consequence of the activation or requeueing of an entity, say E, then it doesn't invoke bfq_lookup_next_entity to get the next-in-service entity. In contrast, it follows a shorter path: if E happens to be eligible (see commit "bfq-sq-mq: make lookup_next_entity push up vtime on expirations" for details on eligibility) and to have a lower virtual finish time than the current candidate as next-in-service entity, then E directly becomes the next-in-service entity. Unfortunately, there is a corner case for which this shorter path makes bfq_update_next_in_service choose a non eligible entity: it occurs if both E and the current next-in-service entity happen to be non eligible when bfq_update_next_in_service is invoked. In this case, E is not set as next-in-service, and, since bfq_lookup_next_entity is not invoked, the state of the parent entity is not updated so as to end up with an eligible entity as the proper next-in-service entity. In this respect, next-in-service is actually allowed to be non eligible while some queue is in service: since no system-virtual-time push-up can be performed in that case (see again commit "bfq-sq-mq: make lookup_next_entity push up vtime on expirations" for details), next-in-service is chosen, speculatively, as a function of the possible value that the system virtual time may get after a push up. But the correctness of the schedule breaks if next-in-service is still a non eligible entity when it is time to set in service the next entity. Unfortunately, this may happen in the above corner case. This commit fixes this problem by making bfq_update_next_in_service invoke bfq_lookup_next_entity not only if the above shorter path cannot be taken, but also if the shorter path is taken but fails to yield an eligible next-in-service entity. Signed-off-by: Paolo Valente --- block/bfq-sched.c | 38 ++++++++++++++++++++++++++++---------- 1 file changed, 28 insertions(+), 10 deletions(-) diff --git a/block/bfq-sched.c b/block/bfq-sched.c index b1a59088db88..e4a2553a2d2c 100644 --- a/block/bfq-sched.c +++ b/block/bfq-sched.c @@ -70,6 +70,7 @@ static bool bfq_update_next_in_service(struct bfq_sched_data *sd, struct bfq_entity *next_in_service = sd->next_in_service; struct bfq_queue *bfqq; bool parent_sched_may_change = false; + bool change_without_lookup = false; /* * If this update is triggered by the activation, requeueing @@ -89,7 +90,7 @@ static bool bfq_update_next_in_service(struct bfq_sched_data *sd, * set to true, and left as true if * sd->next_in_service is NULL. */ - bool replace_next = true; + change_without_lookup = true; /* * If there is already a next_in_service candidate @@ -102,7 +103,7 @@ static bool bfq_update_next_in_service(struct bfq_sched_data *sd, struct bfq_service_tree *st = sd->service_tree + new_entity_class_idx; - replace_next = + change_without_lookup = (new_entity_class_idx == bfq_class_idx(next_in_service) && @@ -112,15 +113,32 @@ static bool bfq_update_next_in_service(struct bfq_sched_data *sd, new_entity->finish)); } - if (replace_next) + if (change_without_lookup) { next_in_service = new_entity; - } else /* invoked because of a deactivation: lookup needed */ + bfqq = bfq_entity_to_bfqq(next_in_service); + + if (bfqq) + bfq_log_bfqq(bfqq->bfqd, bfqq, + "update_next_in_service: chose without lookup"); +#ifdef BFQ_GROUP_IOSCHED_ENABLED + else { + struct bfq_group *bfqg = + container_of(next_in_service, + struct bfq_group, entity); + + bfq_log_bfqg((struct bfq_data*)bfqg->bfqd, bfqg, + "update_next_in_service: chose without lookup"); + } +#endif + } + } + + if (!change_without_lookup) /* lookup needed */ next_in_service = bfq_lookup_next_entity(sd, expiration); - if (next_in_service) { + if (next_in_service) parent_sched_may_change = !sd->next_in_service || bfq_update_parent_budget(next_in_service); - } sd->next_in_service = next_in_service; @@ -1053,7 +1071,7 @@ static void bfq_update_fin_time_enqueue(struct bfq_entity *entity, if (bfqq) { bfq_log_bfqq(bfqq->bfqd, bfqq, - "__activate_entity: new queue finish %llu", + "update_fin_time_enqueue: new queue finish %llu", ((entity->finish>>10)*1000)>>12); #ifdef BFQ_GROUP_IOSCHED_ENABLED } else { @@ -1061,7 +1079,7 @@ static void bfq_update_fin_time_enqueue(struct bfq_entity *entity, container_of(entity, struct bfq_group, entity); bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, - "__activate_entity: new group finish %llu", + "update_fin_time_enqueue: new group finish %llu", ((entity->finish>>10)*1000)>>12); #endif } @@ -1071,7 +1089,7 @@ static void bfq_update_fin_time_enqueue(struct bfq_entity *entity, if (bfqq) { bfq_log_bfqq(bfqq->bfqd, bfqq, - "__activate_entity: queue %seligible in st %p", + "update_fin_time_enqueue: queue %seligible in st %p", entity->start <= st->vtime ? "" : "non ", st); #ifdef BFQ_GROUP_IOSCHED_ENABLED } else { @@ -1079,7 +1097,7 @@ static void bfq_update_fin_time_enqueue(struct bfq_entity *entity, container_of(entity, struct bfq_group, entity); bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, - "__activate_entity: group %seligible in st %p", + "update_fin_time_enqueue: group %seligible in st %p", entity->start <= st->vtime ? "" : "non ", st); #endif } From 6565e4d1aac029b6f0a5d86a4c6ef38608838eac Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Thu, 31 Aug 2017 19:24:26 +0200 Subject: [PATCH 40/51] doc, block, bfq: fix some typos and stale sentences Signed-off-by: Paolo Valente Reviewed-by: Jeremy Hickman Reviewed-by: Laurentiu Nicola --- Documentation/block/bfq-iosched.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/block/bfq-iosched.txt b/Documentation/block/bfq-iosched.txt index 0e59f1c9d30e..dcfe15523da3 100644 --- a/Documentation/block/bfq-iosched.txt +++ b/Documentation/block/bfq-iosched.txt @@ -17,7 +17,7 @@ instances of BFQ are available (otherwise only the first instance): - bfq-mq: development version of BFQ for blk-mq; this version contains also all latest features and fixes not yet landed in mainline, plus many safety checks -- bfq: BFQ for legacy blk; also this version contains latest features +- bfq-sq: BFQ for legacy blk; also this version contains latest features and fixes, as well as safety checks In its default configuration, BFQ privileges latency over From 261ee8cc9f43e03d790a07184f0bcaa504ee6737 Mon Sep 17 00:00:00 2001 From: Luca Miccio Date: Wed, 13 Sep 2017 12:03:56 +0200 Subject: [PATCH 41/51] bfq-mq, bfq-sq: Disable writeback throttling Similarly to CFQ, BFQ has its write-throttling heuristics, and it is better not to combine them with further write-throttling heuristics of a different nature. So this commit disables write-back throttling for a device if BFQ is used as I/O scheduler for that device. Signed-off-by: Luca Miccio Signed-off-by: Paolo Valente Tested-by: Oleksandr Natalenko --- block/bfq-mq-iosched.c | 2 ++ block/bfq-sq-iosched.c | 7 +++++++ 2 files changed, 9 insertions(+) diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c index b5c848650375..7d27d5b3befb 100644 --- a/block/bfq-mq-iosched.c +++ b/block/bfq-mq-iosched.c @@ -89,6 +89,7 @@ #include "blk-mq-tag.h" #include "blk-mq-sched.h" #include "bfq-mq.h" +#include "blk-wbt.h" /* Expiration time of sync (0) and async (1) requests, in ns. */ static const u64 bfq_fifo_expire[2] = { NSEC_PER_SEC / 4, NSEC_PER_SEC / 8 }; @@ -5260,6 +5261,7 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) bfq_init_root_group(bfqd->root_group, bfqd); bfq_init_entity(&bfqd->oom_bfqq.entity, bfqd->root_group); + wbt_disable_default(q); return 0; out_free: diff --git a/block/bfq-sq-iosched.c b/block/bfq-sq-iosched.c index 42393ab889a9..6fdc3b1d5bb8 100644 --- a/block/bfq-sq-iosched.c +++ b/block/bfq-sq-iosched.c @@ -83,6 +83,7 @@ #include #include "blk.h" #include "bfq.h" +#include "blk-wbt.h" /* Expiration time of sync (0) and async (1) requests, in ns. */ static const u64 bfq_fifo_expire[2] = { NSEC_PER_SEC / 4, NSEC_PER_SEC / 8 }; @@ -4976,6 +4977,11 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) return -ENOMEM; } +static void bfq_registered_queue(struct request_queue *q) +{ + wbt_disable_default(q); +} + static void bfq_slab_kill(void) { kmem_cache_destroy(bfq_pool); @@ -5285,6 +5291,7 @@ static struct elevator_type iosched_bfq = { .elevator_may_queue_fn = bfq_may_queue, .elevator_init_fn = bfq_init_queue, .elevator_exit_fn = bfq_exit_queue, + .elevator_registered_fn = bfq_registered_queue, }, .icq_size = sizeof(struct bfq_io_cq), .icq_align = __alignof__(struct bfq_io_cq), From 40ea0aed088791da27fcfa51f3b64d1f96b0d06e Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Tue, 12 Sep 2017 16:45:53 +0200 Subject: [PATCH 42/51] bfq-mq, bfq-sq: fix wrong init of saved start time for weight raising This commit fixes a bug that causes bfq to fail to guarantee a high responsiveness on some drives, if there is heavy random read+write I/O in the background. More precisely, such a failure allowed this bug to be found [1], but the bug may well cause other yet unreported anomalies. BFQ raises the weight of the bfq_queues associated with soft real-time applications, to privilege the I/O, and thus reduce latency, for these applications. This mechanism is named soft-real-time weight raising in BFQ. A soft real-time period may happen to be nested into an interactive weight raising period, i.e., it may happen that, when a bfq_queue switches to a soft real-time weight-raised state, the bfq_queue is already being weight-raised because deemed interactive too. In this case, BFQ saves in a special variable wr_start_at_switch_to_srt, the time instant when the interactive weight-raising period started for the bfq_queue, i.e., the time instant when BFQ started to deem the bfq_queue interactive. This value is then used to check whether the interactive weight-raising period would still be in progress when the soft real-time weight-raising period ends. If so, interactive weight raising is restored for the bfq_queue. This restore is useful, in particular, because it prevents bfq_queues from losing their interactive weight raising prematurely, as a consequence of spurious, short-lived soft real-time weight-raising periods caused by wrong detections as soft real-time. If, instead, a bfq_queue switches to soft-real-time weight raising while it *is not* already in an interactive weight-raising period, then the variable wr_start_at_switch_to_srt has no meaning during the following soft real-time weight-raising period. Unfortunately the handling of this case is wrong in BFQ: not only the variable is not flagged somehow as meaningless, but it is also set to the time when the switch to soft real-time weight-raising occurs. This may cause an interactive weight-raising period to be considered mistakenly as still in progress, and thus a spurious interactive weight-raising period to start for the bfq_queue, at the end of the soft-real-time weight-raising period. In particular the spurious interactive weight-raising period will be considered as still in progress, if the soft-real-time weight-raising period does not last very long. The bfq_queue will then be wrongly privileged and, if I/O bound, will unjustly steal bandwidth to truly interactive or soft real-time bfq_queues, harming responsiveness and low latency. This commit fixes this issue by just setting wr_start_at_switch_to_srt to minus infinity (farthest past time instant according to jiffies macros): when the soft-real-time weight-raising period ends, certainly no interactive weight-raising period will be considered as still in progress. [1] Background I/O Type: Random - Background I/O mix: Reads and writes - Application to start: LibreOffice Writer in http://www.phoronix.com/scan.php?page=news_item&px=Linux-4.13-IO-Laptop Signed-off-by: Paolo Valente Signed-off-by: Angelo Ruocco Tested-by: Oleksandr Natalenko Tested-by: Lee Tibbert Tested-by: Mirko Montanari --- block/bfq-mq-iosched.c | 50 +++++++++++++++++++++++++++++++------------------- block/bfq-sq-iosched.c | 50 +++++++++++++++++++++++++++++++------------------- 2 files changed, 62 insertions(+), 38 deletions(-) diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c index 7d27d5b3befb..f378519b6d33 100644 --- a/block/bfq-mq-iosched.c +++ b/block/bfq-mq-iosched.c @@ -1204,6 +1204,24 @@ static bool bfq_bfqq_update_budg_for_activation(struct bfq_data *bfqd, return wr_or_deserves_wr; } +/* + * Return the farthest future time instant according to jiffies + * macros. + */ +static unsigned long bfq_greatest_from_now(void) +{ + return jiffies + MAX_JIFFY_OFFSET; +} + +/* + * Return the farthest past time instant according to jiffies + * macros. + */ +static unsigned long bfq_smallest_from_now(void) +{ + return jiffies - MAX_JIFFY_OFFSET; +} + static void bfq_update_bfqq_wr_on_rq_arrival(struct bfq_data *bfqd, struct bfq_queue *bfqq, unsigned int old_wr_coeff, @@ -1218,7 +1236,19 @@ static void bfq_update_bfqq_wr_on_rq_arrival(struct bfq_data *bfqd, bfqq->wr_coeff = bfqd->bfq_wr_coeff; bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); } else { - bfqq->wr_start_at_switch_to_srt = jiffies; + /* + * No interactive weight raising in progress + * here: assign minus infinity to + * wr_start_at_switch_to_srt, to make sure + * that, at the end of the soft-real-time + * weight raising periods that is starting + * now, no interactive weight-raising period + * may be wrongly considered as still in + * progress (and thus actually started by + * mistake). + */ + bfqq->wr_start_at_switch_to_srt = + bfq_smallest_from_now(); bfqq->wr_coeff = bfqd->bfq_wr_coeff * BFQ_SOFTRT_WEIGHT_FACTOR; bfqq->wr_cur_max_time = @@ -3174,24 +3204,6 @@ static unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd, jiffies + nsecs_to_jiffies(bfqq->bfqd->bfq_slice_idle) + 4); } -/* - * Return the farthest future time instant according to jiffies - * macros. - */ -static unsigned long bfq_greatest_from_now(void) -{ - return jiffies + MAX_JIFFY_OFFSET; -} - -/* - * Return the farthest past time instant according to jiffies - * macros. - */ -static unsigned long bfq_smallest_from_now(void) -{ - return jiffies - MAX_JIFFY_OFFSET; -} - /** * bfq_bfqq_expire - expire a queue. * @bfqd: device owning the queue. diff --git a/block/bfq-sq-iosched.c b/block/bfq-sq-iosched.c index 6fdc3b1d5bb8..f4654436cd55 100644 --- a/block/bfq-sq-iosched.c +++ b/block/bfq-sq-iosched.c @@ -1165,6 +1165,24 @@ static bool bfq_bfqq_update_budg_for_activation(struct bfq_data *bfqd, return wr_or_deserves_wr; } +/* + * Return the farthest future time instant according to jiffies + * macros. + */ +static unsigned long bfq_greatest_from_now(void) +{ + return jiffies + MAX_JIFFY_OFFSET; +} + +/* + * Return the farthest past time instant according to jiffies + * macros. + */ +static unsigned long bfq_smallest_from_now(void) +{ + return jiffies - MAX_JIFFY_OFFSET; +} + static void bfq_update_bfqq_wr_on_rq_arrival(struct bfq_data *bfqd, struct bfq_queue *bfqq, unsigned int old_wr_coeff, @@ -1179,7 +1197,19 @@ static void bfq_update_bfqq_wr_on_rq_arrival(struct bfq_data *bfqd, bfqq->wr_coeff = bfqd->bfq_wr_coeff; bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); } else { - bfqq->wr_start_at_switch_to_srt = jiffies; + /* + * No interactive weight raising in progress + * here: assign minus infinity to + * wr_start_at_switch_to_srt, to make sure + * that, at the end of the soft-real-time + * weight raising periods that is starting + * now, no interactive weight-raising period + * may be wrongly considered as still in + * progress (and thus actually started by + * mistake). + */ + bfqq->wr_start_at_switch_to_srt = + bfq_smallest_from_now(); bfqq->wr_coeff = bfqd->bfq_wr_coeff * BFQ_SOFTRT_WEIGHT_FACTOR; bfqq->wr_cur_max_time = @@ -3067,24 +3097,6 @@ static unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd, jiffies + nsecs_to_jiffies(bfqq->bfqd->bfq_slice_idle) + 4); } -/* - * Return the farthest future time instant according to jiffies - * macros. - */ -static unsigned long bfq_greatest_from_now(void) -{ - return jiffies + MAX_JIFFY_OFFSET; -} - -/* - * Return the farthest past time instant according to jiffies - * macros. - */ -static unsigned long bfq_smallest_from_now(void) -{ - return jiffies - MAX_JIFFY_OFFSET; -} - /** * bfq_bfqq_expire - expire a queue. * @bfqd: device owning the queue. From 9dbea44b6f721baeff35b9fdf628ec55fe00e09d Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Thu, 14 Sep 2017 05:12:58 -0400 Subject: [PATCH 43/51] Fix commit "Unnest request-queue and ioc locks from scheduler locks" The commit "Unnest request-queue and ioc locks from scheduler locks" mistakenly removed the setting of the split flag in function bfq_prepare_request. This commit puts this missing instruction back in its place. Signed-off-by: Paolo Valente --- block/bfq-mq-iosched.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c index f378519b6d33..288078e68a2a 100644 --- a/block/bfq-mq-iosched.c +++ b/block/bfq-mq-iosched.c @@ -744,6 +744,12 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd, bfqq->wr_cur_max_time = bic->saved_wr_cur_max_time; BUG_ON(time_is_after_jiffies(bfqq->last_wr_start_finish)); + bfq_log_bfqq(bfqq->bfqd, bfqq, + "[%s] bic %p wr_coeff %d start_finish %lu max_time %lu", + __func__, + bic, bfqq->wr_coeff, bfqq->last_wr_start_finish, + bfqq->wr_cur_max_time); + if (bfqq->wr_coeff > 1 && (bfq_bfqq_in_large_burst(bfqq) || time_is_before_jiffies(bfqq->last_wr_start_finish + bfqq->wr_cur_max_time))) { @@ -2208,6 +2214,11 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq) bic->saved_last_wr_start_finish = bfqq->last_wr_start_finish; bic->saved_wr_cur_max_time = bfqq->wr_cur_max_time; BUG_ON(time_is_after_jiffies(bfqq->last_wr_start_finish)); + bfq_log_bfqq(bfqq->bfqd, bfqq, + "[%s] bic %p wr_coeff %d start_finish %lu max_time %lu", + __func__, + bic, bfqq->wr_coeff, bfqq->last_wr_start_finish, + bfqq->wr_cur_max_time); } static void @@ -4950,6 +4961,7 @@ static void bfq_prepare_request(struct request *rq, struct bio *bio) bic->saved_in_large_burst = true; bfqq = bfq_split_bfqq(bic, bfqq); + split = true; if (!bfqq) bfqq = bfq_get_bfqq_handle_split(bfqd, bic, bio, From d4ebb2a66a23dc183792088c521f2be2193b56db Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Fri, 15 Sep 2017 01:53:51 -0400 Subject: [PATCH 44/51] bfq-sq, bfq-mq: check and switch back to interactive wr also on queue split As already explained in the message of commit "bfq-mq, bfq-sq: fix wrong init of saved start time for weight raising", if a soft real-time weight-raising period happens to be nested in a larger interactive weight-raising period, then BFQ restores the interactive weight raising at the end of the soft real-time weight raising. In particular, BFQ checks whether the latter has ended only on request dispatches. Unfortunately, the above scheme fails to restore interactive weight raising in the following corner case: if a bfq_queue, say Q, 1) Is merged with another bfq_queue while it is in a nested soft real-time weight-raising period. The weight-raising state of Q is then saved, and not considered any longer until a split occurs. 2) Is split from the other bfq_queue(s) at a time instant when its soft real-time weight raising is already finished. On the split, while resuming the previous, soft real-time weight-raised state of the bfq_queue Q, BFQ checks whether the current soft real-time weight-raising period is actually over. If so, BFQ switches weight raising off for Q, *without* checking whether the soft real-time period was actually nested in a non-yet-finished interactive weight-raising period. This commit addresses this issue by adding the above missing check in bfq_queue splits, and restoring interactive weight raising if needed. Signed-off-by: Paolo Valente Tested-by: Angelo Ruocco Tested-by: Mirko Montanari --- block/bfq-mq-iosched.c | 29 +++++++++++++++++++++-------- block/bfq-sq-iosched.c | 35 +++++++++++++++++++++++++++-------- 2 files changed, 48 insertions(+), 16 deletions(-) diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c index 288078e68a2a..6130a95c6497 100644 --- a/block/bfq-mq-iosched.c +++ b/block/bfq-mq-iosched.c @@ -716,6 +716,15 @@ static unsigned int bfq_wr_duration(struct bfq_data *bfqd) return dur; } +/* switch back from soft real-time to interactive weight raising */ +static void switch_back_to_interactive_wr(struct bfq_queue *bfqq, + struct bfq_data *bfqd) +{ + bfqq->wr_coeff = bfqd->bfq_wr_coeff; + bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); + bfqq->last_wr_start_finish = bfqq->wr_start_at_switch_to_srt; +} + static void bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd, struct bfq_io_cq *bic, bool bfq_already_existing) @@ -753,12 +762,20 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd, if (bfqq->wr_coeff > 1 && (bfq_bfqq_in_large_burst(bfqq) || time_is_before_jiffies(bfqq->last_wr_start_finish + bfqq->wr_cur_max_time))) { - bfq_log_bfqq(bfqq->bfqd, bfqq, + if (bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time && + !bfq_bfqq_in_large_burst(bfqq) && + time_is_after_eq_jiffies(bfqq->wr_start_at_switch_to_srt + + bfq_wr_duration(bfqd))) { + switch_back_to_interactive_wr(bfqq, bfqd); + bfq_log_bfqq(bfqq->bfqd, bfqq, + "resume state: switching back to interactive"); + } else { + bfqq->wr_coeff = 1; + bfq_log_bfqq(bfqq->bfqd, bfqq, "resume state: switching off wr (%lu + %lu < %lu)", bfqq->last_wr_start_finish, bfqq->wr_cur_max_time, jiffies); - - bfqq->wr_coeff = 1; + } } /* make sure weight will be updated, however we got here */ @@ -3820,11 +3837,7 @@ static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq) bfq_wr_duration(bfqd))) bfq_bfqq_end_wr(bfqq); else { - /* switch back to interactive wr */ - bfqq->wr_coeff = bfqd->bfq_wr_coeff; - bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); - bfqq->last_wr_start_finish = - bfqq->wr_start_at_switch_to_srt; + switch_back_to_interactive_wr(bfqq, bfqd); BUG_ON(time_is_after_jiffies( bfqq->last_wr_start_finish)); bfqq->entity.prio_changed = 1; diff --git a/block/bfq-sq-iosched.c b/block/bfq-sq-iosched.c index f4654436cd55..e07d5d1c0d40 100644 --- a/block/bfq-sq-iosched.c +++ b/block/bfq-sq-iosched.c @@ -678,6 +678,15 @@ static unsigned int bfq_wr_duration(struct bfq_data *bfqd) return dur; } +/* switch back from soft real-time to interactive weight raising */ +static void switch_back_to_interactive_wr(struct bfq_queue *bfqq, + struct bfq_data *bfqd) +{ + bfqq->wr_coeff = bfqd->bfq_wr_coeff; + bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); + bfqq->last_wr_start_finish = bfqq->wr_start_at_switch_to_srt; +} + static void bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd, struct bfq_io_cq *bic, bool bfq_already_existing) @@ -705,15 +714,29 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd, bfqq->wr_cur_max_time = bic->saved_wr_cur_max_time; BUG_ON(time_is_after_jiffies(bfqq->last_wr_start_finish)); + bfq_log_bfqq(bfqq->bfqd, bfqq, + "[%s] bic %p wr_coeff %d start_finish %lu max_time %lu", + __func__, + bic, bfqq->wr_coeff, bfqq->last_wr_start_finish, + bfqq->wr_cur_max_time); + if (bfqq->wr_coeff > 1 && (bfq_bfqq_in_large_burst(bfqq) || time_is_before_jiffies(bfqq->last_wr_start_finish + bfqq->wr_cur_max_time))) { - bfq_log_bfqq(bfqq->bfqd, bfqq, + if (bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time && + !bfq_bfqq_in_large_burst(bfqq) && + time_is_after_eq_jiffies(bfqq->wr_start_at_switch_to_srt + + bfq_wr_duration(bfqd))) { + switch_back_to_interactive_wr(bfqq, bfqd); + bfq_log_bfqq(bfqq->bfqd, bfqq, + "resume state: switching back to interactive"); + } else { + bfqq->wr_coeff = 1; + bfq_log_bfqq(bfqq->bfqd, bfqq, "resume state: switching off wr (%lu + %lu < %lu)", bfqq->last_wr_start_finish, bfqq->wr_cur_max_time, jiffies); - - bfqq->wr_coeff = 1; + } } /* make sure weight will be updated, however we got here */ @@ -3703,11 +3726,7 @@ static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq) bfq_wr_duration(bfqd))) bfq_bfqq_end_wr(bfqq); else { - /* switch back to interactive wr */ - bfqq->wr_coeff = bfqd->bfq_wr_coeff; - bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); - bfqq->last_wr_start_finish = - bfqq->wr_start_at_switch_to_srt; + switch_back_to_interactive_wr(bfqq, bfqd); BUG_ON(time_is_after_jiffies( bfqq->last_wr_start_finish)); bfqq->entity.prio_changed = 1; From 9eaec0c3a2d675763b09da81c9117a9c43bce942 Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Fri, 15 Sep 2017 04:58:33 -0400 Subject: [PATCH 45/51] bfq-sq, bfq-mq: let early-merged queues be weight-raised on split too A just-created bfq_queue, say Q, may happen to be merged with another bfq_queue on the very first invocation of the function __bfq_insert_request. In such a case, even if Q would clearly deserve interactive weight raising (as it has just been created), the function bfq_add_request does not make it to be invoked for Q, and thus to activate weight raising for Q. As a consequence, when the state of Q is saved for a possible future restore, after a split of Q from the other bfq_queue(s), such a state happens to be (unjustly) non-weight-raised. Then the bfq_queue will not enjoy any weight raising on the split, even if should still be in an interactive weight-raising period when the split occurs. This commit solves this problem as follows, for a just-created bfq_queue that is being early-merged: it stores directly, in the saved state of the bfq_queue, the weight-raising state that would have been assigned to the bfq_queue if not early-merged. Signed-off-by: Paolo Valente Tested-by: Angelo Ruocco Tested-by: Mirko Montanari --- block/bfq-mq-iosched.c | 28 +++++++++++++++++++++++----- block/bfq-sq-iosched.c | 28 +++++++++++++++++++++++----- 2 files changed, 46 insertions(+), 10 deletions(-) diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c index 6130a95c6497..af84e506e897 100644 --- a/block/bfq-mq-iosched.c +++ b/block/bfq-mq-iosched.c @@ -2226,10 +2226,27 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq) bic->saved_IO_bound = bfq_bfqq_IO_bound(bfqq); bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq); bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node); - bic->saved_wr_coeff = bfqq->wr_coeff; - bic->saved_wr_start_at_switch_to_srt = bfqq->wr_start_at_switch_to_srt; - bic->saved_last_wr_start_finish = bfqq->last_wr_start_finish; - bic->saved_wr_cur_max_time = bfqq->wr_cur_max_time; + if (unlikely(bfq_bfqq_just_created(bfqq) && + !bfq_bfqq_in_large_burst(bfqq))) { + /* + * bfqq being merged ritgh after being created: bfqq + * would have deserved interactive weight raising, but + * did not make it to be set in a weight-raised state, + * because of this early merge. Store directly the + * weight-raising state that would have been assigned + * to bfqq, so that to avoid that bfqq unjustly fails + * to enjoy weight raising if split soon. + */ + bic->saved_wr_coeff = bfqq->bfqd->bfq_wr_coeff; + bic->saved_wr_cur_max_time = bfq_wr_duration(bfqq->bfqd); + bic->saved_last_wr_start_finish = jiffies; + } else { + bic->saved_wr_coeff = bfqq->wr_coeff; + bic->saved_wr_start_at_switch_to_srt = + bfqq->wr_start_at_switch_to_srt; + bic->saved_last_wr_start_finish = bfqq->last_wr_start_finish; + bic->saved_wr_cur_max_time = bfqq->wr_cur_max_time; + } BUG_ON(time_is_after_jiffies(bfqq->last_wr_start_finish)); bfq_log_bfqq(bfqq->bfqd, bfqq, "[%s] bic %p wr_coeff %d start_finish %lu max_time %lu", @@ -4560,7 +4577,6 @@ static void __bfq_insert_request(struct bfq_data *bfqd, struct request *rq) bfqq->allocated); new_bfqq->ref++; - bfq_clear_bfqq_just_created(bfqq); /* * If the bic associated with the process * issuing this request still points to bfqq @@ -4572,6 +4588,8 @@ static void __bfq_insert_request(struct bfq_data *bfqd, struct request *rq) if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq) bfq_merge_bfqqs(bfqd, RQ_BIC(rq), bfqq, new_bfqq); + + bfq_clear_bfqq_just_created(bfqq); /* * rq is about to be enqueued into new_bfqq, * release rq reference on bfqq diff --git a/block/bfq-sq-iosched.c b/block/bfq-sq-iosched.c index e07d5d1c0d40..0c48f527fe3f 100644 --- a/block/bfq-sq-iosched.c +++ b/block/bfq-sq-iosched.c @@ -2105,10 +2105,27 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq) bic->saved_IO_bound = bfq_bfqq_IO_bound(bfqq); bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq); bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node); - bic->saved_wr_coeff = bfqq->wr_coeff; - bic->saved_wr_start_at_switch_to_srt = bfqq->wr_start_at_switch_to_srt; - bic->saved_last_wr_start_finish = bfqq->last_wr_start_finish; - bic->saved_wr_cur_max_time = bfqq->wr_cur_max_time; + if (unlikely(bfq_bfqq_just_created(bfqq) && + !bfq_bfqq_in_large_burst(bfqq))) { + /* + * bfqq being merged ritgh after being created: bfqq + * would have deserved interactive weight raising, but + * did not make it to be set in a weight-raised state, + * because of this early merge. Store directly the + * weight-raising state that would have been assigned + * to bfqq, so that to avoid that bfqq unjustly fails + * to enjoy weight raising if split soon. + */ + bic->saved_wr_coeff = bfqq->bfqd->bfq_wr_coeff; + bic->saved_wr_cur_max_time = bfq_wr_duration(bfqq->bfqd); + bic->saved_last_wr_start_finish = jiffies; + } else { + bic->saved_wr_coeff = bfqq->wr_coeff; + bic->saved_wr_start_at_switch_to_srt = + bfqq->wr_start_at_switch_to_srt; + bic->saved_last_wr_start_finish = bfqq->last_wr_start_finish; + bic->saved_wr_cur_max_time = bfqq->wr_cur_max_time; + } BUG_ON(time_is_after_jiffies(bfqq->last_wr_start_finish)); } @@ -4383,10 +4400,11 @@ static void bfq_insert_request(struct request_queue *q, struct request *rq) new_bfqq->allocated[rq_data_dir(rq)]++; bfqq->allocated[rq_data_dir(rq)]--; new_bfqq->ref++; - bfq_clear_bfqq_just_created(bfqq); if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq) bfq_merge_bfqqs(bfqd, RQ_BIC(rq), bfqq, new_bfqq); + + bfq_clear_bfqq_just_created(bfqq); /* * rq is about to be enqueued into new_bfqq, * release rq reference on bfqq From cb05150675095cb97ab22e4955eb82e4fe2e9dbe Mon Sep 17 00:00:00 2001 From: omcira Date: Mon, 18 Sep 2017 10:49:48 +0200 Subject: [PATCH 46/51] bfq-sq, bfq-mq: decrease burst size when queues in burst exit If many queues belonging to the same group happen to be created shortly after each other, then the concurrent processes associated with these queues have typically a common goal, and they get it done as soon as possible if not hampered by device idling. Examples are processes spawned by git grep, or by systemd during boot. As for device idling, this mechanism is currently necessary for weight raising to succeed in its goal: privileging I/O. In view of these facts, BFQ does not provide the above queues with either weight raising or device idling. On the other hand, a burst of queue creations may be caused also by the start-up of a complex application. In this case, these queues need usually to be served one after the other, and as quickly as possible, to maximise responsiveness. Therefore, in this case the best strategy is to weight-raise all the queues created during the burst, i.e., the exact opposite of the strategy for the above case. To distinguish between the two cases, BFQ uses an empirical burst-size threshold, found through extensive tests and monitoring of daily usage. Only large bursts, i.e., burst with a size above this threshold, are considered as generated by a high number of parallel processes. In this respect, upstart-based boot proved to be rather hard to detect as generating a large burst of queue creations, because with upstart most of the queues created in a burst exit *before* the next queues in the same burst are created. To address this issue, I changed the burst-detection mechanism so as to not decrease the size of the current burst even if one of the queues in the burst is eliminated. Unfortunately, this missing decrease causes false positives on very fast systems: on the start-up of a complex application, such as libreoffice writer, so many queues are created, served and exited shortly after each other, that a large burst of queue creations is wrongly detected as occurring. These false positives just disappear if the size of a burst is decreased when one of the queues in the burst exits. This commit restores the missing burst-size decrease, relying of the fact that upstart is apparently unlikely to be used on systems running this and future versions of the kernel. Signed-off-by: Paolo Valente Signed-off-by: Mauro Andreolini Signed-off-by: Angelo Ruocco Tested-by: Mirko Montanari --- block/bfq-mq-iosched.c | 12 +++--------- block/bfq-sq-iosched.c | 12 +++--------- 2 files changed, 6 insertions(+), 18 deletions(-) diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c index af84e506e897..6e413d7236ce 100644 --- a/block/bfq-mq-iosched.c +++ b/block/bfq-mq-iosched.c @@ -4111,16 +4111,10 @@ static void bfq_put_queue(struct bfq_queue *bfqq) BUG_ON(bfqq->entity.tree); BUG_ON(bfq_bfqq_busy(bfqq)); - if (bfq_bfqq_sync(bfqq)) - /* - * The fact that this queue is being destroyed does not - * invalidate the fact that this queue may have been - * activated during the current burst. As a consequence, - * although the queue does not exist anymore, and hence - * needs to be removed from the burst list if there, - * the burst size has not to be decremented. - */ + if (bfq_bfqq_sync(bfqq) && !hlist_unhashed(&bfqq->burst_list_node)) { hlist_del_init(&bfqq->burst_list_node); + bfqq->bfqd->burst_size--; + } if (bfqq->bfqd) bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p freed", bfqq); diff --git a/block/bfq-sq-iosched.c b/block/bfq-sq-iosched.c index 0c48f527fe3f..93034dd7b801 100644 --- a/block/bfq-sq-iosched.c +++ b/block/bfq-sq-iosched.c @@ -3945,16 +3945,10 @@ static void bfq_put_queue(struct bfq_queue *bfqq) BUG_ON(bfqq->entity.tree); BUG_ON(bfq_bfqq_busy(bfqq)); - if (bfq_bfqq_sync(bfqq)) - /* - * The fact that this queue is being destroyed does not - * invalidate the fact that this queue may have been - * activated during the current burst. As a consequence, - * although the queue does not exist anymore, and hence - * needs to be removed from the burst list if there, - * the burst size has not to be decremented. - */ + if (bfq_bfqq_sync(bfqq) && !hlist_unhashed(&bfqq->burst_list_node)) { hlist_del_init(&bfqq->burst_list_node); + bfqq->bfqd->burst_size--; + } bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p freed", bfqq); From 60de7307d5e3ed7f272f12c900f631bdfe114db2 Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Fri, 6 Oct 2017 19:35:38 +0200 Subject: [PATCH 47/51] bfq-sq, bfq-mq: fix unbalanced decrements of burst size MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The commit "bfq-sq, bfq-mq: decrease burst size when queues in burst exit" introduced the decrement of burst_size on the removal of a bfq_queue from the burst list. Unfortunately, this decrement can happen to be performed even when burst size is already equal to 0, because of unbalanced decrements. A description follows of the cause of these unbalanced decrements, namely a wrong assumption, and of the way how this wrong assumption leads to unbalanced decrements. The wrong assumption is that a bfq_queue can exit only if the process associated with the bfq_queue has exited. This is false, because a bfq_queue, say Q, may exit also as a consequence of a merge with another bfq_queue. In this case, Q exits because the I/O of its associated process has been redirected to another bfq_queue. The decrement unbalance occurs because Q may then be re-created after a split, and added back to the current burst list, *without* incrementing burst_size. burst_size is not incremented because Q is not a new bfq_queue added to the burst list, but a bfq_queue only temporarily removed from the list, and, before the commit "bfq-sq, bfq-mq: decrease burst size when queues in burst exit", burst_size was not decremented when Q was removed. This commit addresses this issue by just checking whether the exiting bfq_queue is a merged bfq_queue, and, in that case, not decrementing burst_size. Unfortunately, this still leaves room for unbalanced decrements, in the following rarer case: on a split, the bfq_queue happens to be inserted into a different burst list than that it was removed from when merged. If this happens, the number of elements in the new burst list becomes higher than burst_size (by one). When the bfq_queue then exits, it is of course not in a merged state any longer, thus burst_size is decremented, which results in an unbalanced decrement. To handle this sporadic, unlucky case in a simple way, this commit also checks that burst_size is larger than 0 before decrementing it. Finally, this commit removes an useless, extra check: the check that the bfq_queue is sync, performed before checking whether the bfq_queue is in the burst list. This extra check is redundant, because only sync bfq_queues can be inserted into the burst list. Reported-by: Philip Müller Signed-off-by: Paolo Valente Signed-off-by: Angelo Ruocco Tested-by: Philip Müller Tested-by: Oleksandr Natalenko Tested-by: Lee Tibbert --- block/bfq-mq-iosched.c | 59 ++++++++++++++++++++++++++++++++++++++++++++++++-- block/bfq-sq-iosched.c | 59 ++++++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 114 insertions(+), 4 deletions(-) diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c index 6e413d7236ce..816bac6cdd3d 100644 --- a/block/bfq-mq-iosched.c +++ b/block/bfq-mq-iosched.c @@ -4111,9 +4111,36 @@ static void bfq_put_queue(struct bfq_queue *bfqq) BUG_ON(bfqq->entity.tree); BUG_ON(bfq_bfqq_busy(bfqq)); - if (bfq_bfqq_sync(bfqq) && !hlist_unhashed(&bfqq->burst_list_node)) { + if (!hlist_unhashed(&bfqq->burst_list_node)) { hlist_del_init(&bfqq->burst_list_node); - bfqq->bfqd->burst_size--; + /* + * Decrement also burst size after the removal, if the + * process associated with bfqq is exiting, and thus + * does not contribute to the burst any longer. This + * decrement helps filter out false positives of large + * bursts, when some short-lived process (often due to + * the execution of commands by some service) happens + * to start and exit while a complex application is + * starting, and thus spawning several processes that + * do I/O (and that *must not* be treated as a large + * burst, see comments on bfq_handle_burst). + * + * In particular, the decrement is performed only if: + * 1) bfqq is not a merged queue, because, if it is, + * then this free of bfqq is not triggered by the exit + * of the process bfqq is associated with, but exactly + * by the fact that bfqq has just been merged. + * 2) burst_size is greater than 0, to handle + * unbalanced decrements. Unbalanced decrements may + * happen in te following case: bfqq is inserted into + * the current burst list--without incrementing + * bust_size--because of a split, but the current + * burst list is not the burst list bfqq belonged to + * (see comments on the case of a split in + * bfq_set_request). + */ + if (bfqq->bic && bfqq->bfqd->burst_size > 0) + bfqq->bfqd->burst_size--; } if (bfqq->bfqd) @@ -4940,6 +4967,34 @@ static struct bfq_queue *bfq_get_bfqq_handle_split(struct bfq_data *bfqd, "large burst"); bfq_clear_bfqq_in_large_burst(bfqq); if (bic->was_in_burst_list) + /* + * If bfqq was in the current + * burst list before being + * merged, then we have to add + * it back. And we do not need + * to increase burst_size, as + * we did not decrement + * burst_size when we removed + * bfqq from the burst list as + * a consequence of a merge + * (see comments in + * bfq_put_queue). In this + * respect, it would be rather + * costly to know whether the + * current burst list is still + * the same burst list from + * which bfqq was removed on + * the merge. To avoid this + * cost, if bfqq was in a + * burst list, then we add + * bfqq to the current burst + * list without any further + * check. This can cause + * inappropriate insertions, + * but rarely enough to not + * harm the detection of large + * bursts significantly. + */ hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list); } diff --git a/block/bfq-sq-iosched.c b/block/bfq-sq-iosched.c index 93034dd7b801..4bbd7f4c0154 100644 --- a/block/bfq-sq-iosched.c +++ b/block/bfq-sq-iosched.c @@ -3945,9 +3945,36 @@ static void bfq_put_queue(struct bfq_queue *bfqq) BUG_ON(bfqq->entity.tree); BUG_ON(bfq_bfqq_busy(bfqq)); - if (bfq_bfqq_sync(bfqq) && !hlist_unhashed(&bfqq->burst_list_node)) { + if (!hlist_unhashed(&bfqq->burst_list_node)) { hlist_del_init(&bfqq->burst_list_node); - bfqq->bfqd->burst_size--; + /* + * Decrement also burst size after the removal, if the + * process associated with bfqq is exiting, and thus + * does not contribute to the burst any longer. This + * decrement helps filter out false positives of large + * bursts, when some short-lived process (often due to + * the execution of commands by some service) happens + * to start and exit while a complex application is + * starting, and thus spawning several processes that + * do I/O (and that *must not* be treated as a large + * burst, see comments on bfq_handle_burst). + * + * In particular, the decrement is performed only if: + * 1) bfqq is not a merged queue, because, if it is, + * then this free of bfqq is not triggered by the exit + * of the process bfqq is associated with, but exactly + * by the fact that bfqq has just been merged. + * 2) burst_size is greater than 0, to handle + * unbalanced decrements. Unbalanced decrements may + * happen in te following case: bfqq is inserted into + * the current burst list--without incrementing + * bust_size--because of a split, but the current + * burst list is not the burst list bfqq belonged to + * (see comments on the case of a split in + * bfq_set_request). + */ + if (bfqq->bic && bfqq->bfqd->burst_size > 0) + bfqq->bfqd->burst_size--; } bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p freed", bfqq); @@ -4691,6 +4718,34 @@ static int bfq_set_request(struct request_queue *q, struct request *rq, "large burst"); bfq_clear_bfqq_in_large_burst(bfqq); if (bic->was_in_burst_list) + /* + * If bfqq was in the current + * burst list before being + * merged, then we have to add + * it back. And we do not need + * to increase burst_size, as + * we did not decrement + * burst_size when we removed + * bfqq from the burst list as + * a consequence of a merge + * (see comments in + * bfq_put_queue). In this + * respect, it would be rather + * costly to know whether the + * current burst list is still + * the same burst list from + * which bfqq was removed on + * the merge. To avoid this + * cost, if bfqq was in a + * burst list, then we add + * bfqq to the current burst + * list without any further + * check. This can cause + * inappropriate insertions, + * but rarely enough to not + * harm the detection of large + * bursts significantly. + */ hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list); } From 09adbd0f46f4ba395964b35bf611b7cc3dd84b4d Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Mon, 30 Oct 2017 16:50:50 +0100 Subject: [PATCH 48/51] doc, block, bfq-mq: update max IOPS sustainable with BFQ We have investigated more deeply the performance of BFQ, in terms of number of IOPS that can be processed by the CPU when BFQ is used as I/O scheduler. In more detail, using the script [1], we have measured the number of IOPS reached on top of a null block device configured with zero latency, as a function of the workload (sequential read, sequential write, random read, random write) and of the system (we considered desktops, laptops and embedded systems). Basing on the resulting figures, with this commit we update the current, conservative IOPS range reported in BFQ documentation. In particular, the documentation now reports, for each of three different systems, the lowest number of IOPS obtained for that system with the above test (namely, the value obtained with the workload leading to the lowest IOPS). [1] https://github.com/Algodev-github/IOSpeed Signed-off-by: Paolo Valente Signed-off-by: Luca Miccio --- Documentation/block/bfq-iosched.txt | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/Documentation/block/bfq-iosched.txt b/Documentation/block/bfq-iosched.txt index dcfe15523da3..595ff7a5ff34 100644 --- a/Documentation/block/bfq-iosched.txt +++ b/Documentation/block/bfq-iosched.txt @@ -29,12 +29,19 @@ for that device, by setting low_latency to 0. See Section 3 for details on how to configure BFQ for the desired tradeoff between latency and throughput, or on how to maximize throughput. -On average CPUs, the current version of BFQ can handle devices -performing at most ~30K IOPS; at most ~50 KIOPS on faster CPUs. As a -reference, 30-50 KIOPS correspond to very high bandwidths with -sequential I/O (e.g., 8-12 GB/s if I/O requests are 256 KB large), and -to 120-200 MB/s with 4KB random I/O. BFQ is currently being tested on -multi-queue devices too. +BFQ has a non-null overhead, which limits the maximum IOPS that the +CPU can process for a device scheduled with BFQ. To give an idea of +the limits on slow or average CPUs, here are BFQ limits for three +different CPUs, on, respectively, an average laptop, an old desktop, +and a cheap embedded system, in case full hierarchical support is +enabled (i.e., CONFIG_BFQ_SQ_GROUP_IOSCHED is set for bfq-sq, or +CONFIG_MQ_BFQ_GROUP_IOSCHED is set for bfq-mq, or, finally, +CONFIG_BFQ_GROUP_IOSCHED is set for bfq): +- Intel i7-4850HQ: 250 KIOPS +- AMD A8-3850: 170 KIOPS +- ARM CortexTM-A53 Octa-core: 45 KIOPS + +BFQ works for multi-queue devices too (bfq and bfq-mq instances). The table of contents follows. Impatients can just jump to Section 3. From be94f97b577dc587593185224a7718aa59ac43f7 Mon Sep 17 00:00:00 2001 From: Luca Miccio Date: Tue, 31 Oct 2017 09:50:11 +0100 Subject: [PATCH 49/51] block, bfq-mq: add missing invocations of bfqg_stats_update_io_add/remove bfqg_stats_update_io_add and bfqg_stats_update_io_remove are to be invoked, respectively, when an I/O request enters and when an I/O request exits the scheduler. Unfortunately, bfq-mq does not fully comply with this scheme, because it does not invoke these functions for requests that are inserted into or extracted from its priority dispatch list. This commit fixes this mistake. Signed-off-by: Paolo Valente Signed-off-by: Luca Miccio --- block/bfq-mq-iosched.c | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c index 816bac6cdd3d..fbf28804c220 100644 --- a/block/bfq-mq-iosched.c +++ b/block/bfq-mq-iosched.c @@ -1394,7 +1394,6 @@ static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd, BUG_ON(bfqq->entity.budget < bfqq->entity.service); BUG_ON(bfqq == bfqd->in_service_queue); - bfqg_stats_update_io_add(bfqq_group(RQ_BFQQ(rq)), bfqq, rq->cmd_flags); /* * bfqq deserves to be weight-raised if: @@ -1734,7 +1733,6 @@ static void bfq_remove_request(struct request_queue *q, BUG_ON(bfqq->meta_pending == 0); bfqq->meta_pending--; } - bfqg_stats_update_io_remove(bfqq_group(bfqq), rq->cmd_flags); } static bool bfq_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio) @@ -1879,6 +1877,7 @@ static void bfq_requests_merged(struct request_queue *q, struct request *rq, bfqq->next_rq = rq; bfq_remove_request(q, next); + bfqg_stats_update_io_remove(bfqq_group(bfqq), next->cmd_flags); spin_unlock_irq(&bfqq->bfqd->lock); end: @@ -4077,6 +4076,10 @@ static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) spin_lock_irq(&bfqd->lock); rq = __bfq_dispatch_request(hctx); + if (rq && RQ_BFQQ(rq)) + bfqg_stats_update_io_remove(bfqq_group(RQ_BFQQ(rq)), + rq->cmd_flags); + spin_unlock_irq(&bfqd->lock); return rq; @@ -4634,6 +4637,7 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, { struct request_queue *q = hctx->queue; struct bfq_data *bfqd = q->elevator->elevator_data; + struct bfq_queue *bfqq = RQ_BFQQ(rq); spin_lock_irq(&bfqd->lock); if (blk_mq_sched_try_insert_merge(q, rq)) { @@ -4647,8 +4651,6 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, spin_lock_irq(&bfqd->lock); if (at_head || blk_rq_is_passthrough(rq)) { - struct bfq_queue *bfqq = RQ_BFQQ(rq); - if (at_head) list_add(&rq->queuelist, &bfqd->dispatch); else @@ -4668,6 +4670,12 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, rq->rq_flags &= ~RQF_GOT; __bfq_insert_request(bfqd, rq); + /* + * Update bfqq, because, if a queue merge has occurred + * in __bfq_insert_request, then rq has been + * redirected into a new queue. + */ + bfqq = RQ_BFQQ(rq); if (rq_mergeable(rq)) { elv_rqhash_add(q, rq); @@ -4676,6 +4684,9 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, } } + if (bfqq) + bfqg_stats_update_io_add(bfqq_group(bfqq), bfqq, rq->cmd_flags); + spin_unlock_irq(&bfqd->lock); } @@ -4893,8 +4904,11 @@ static void bfq_finish_request(struct request *rq) BUG_ON(in_interrupt()); assert_spin_locked(&bfqd->lock); - if (!RB_EMPTY_NODE(&rq->rb_node)) + if (!RB_EMPTY_NODE(&rq->rb_node)) { bfq_remove_request(rq->q, rq); + bfqg_stats_update_io_remove(bfqq_group(bfqq), + rq->cmd_flags); + } bfq_put_rq_priv_body(bfqq); } From 8659a1549d2bf241129a0f7c90429bddd9c2bc53 Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Wed, 8 Nov 2017 19:07:40 +0100 Subject: [PATCH 50/51] block, bfq-mq: update blkio stats outside the scheduler lock bfq-mq invokes various blkg_*stats_* functions to update the statistics contained in the special files blkio.bfq-mq.* in the blkio controller groups, i.e., the I/O accounting related to the proportional-share policy provided by bfq-mq. The execution of these functions takes a considerable percentage, about 40%, of the total per-request execution time of bfq-mq (i.e., of the sum of the execution time of all the bfq-mq functions that have to be executed to process an I/O request from its creation to its destruction). This reduces the request-processing rate sustainable by bfq-mq noticeably, even on a multicore CPU. In fact, the bfq-mq functions that invoke blkg_*stats_* functions cannot be executed in parallel with the rest of the code of bfq-mq, because both are executed under the same same per-device scheduler lock. To reduce this slowdown, this commit moves, wherever possible, the invocation of these functions (more precisely, of the bfq-mq functions that invoke blkg_*stats_* functions) outside the critical sections protected by the scheduler lock. With this change, and with all blkio.bfq-mq.* statistics enabled, the throughput grows, e.g., from 250 to 310 KIOPS (+25%) on an Intel i7-4850HQ, in case of 8 threads doing random I/O in parallel on null_blk, with the latter configured with 0 latency. We obtained the same or higher throughput boosts, up to +30%, with other processors (some figures are reported in the documentation). For our tests, we used the script [1], with which our results can be easily reproduced. NOTE. This commit still protects the invocation of blkg_*stats_* functions with the request_queue lock, because the group these functions are invoked on may otherwise disappear before or while these functions are executed. Fortunately, tests without even this lock show, by difference, that the serialization caused by this lock has a little impact (at most ~5% of throughput reduction). [1] https://github.com/Algodev-github/IOSpeed Signed-off-by: Paolo Valente Signed-off-by: Luca Miccio --- Documentation/block/bfq-iosched.txt | 18 ++++-- block/bfq-mq-iosched.c | 112 +++++++++++++++++++++++++++++++----- block/bfq-sched.c | 2 + 3 files changed, 112 insertions(+), 20 deletions(-) diff --git a/Documentation/block/bfq-iosched.txt b/Documentation/block/bfq-iosched.txt index 595ff7a5ff34..c816c595082d 100644 --- a/Documentation/block/bfq-iosched.txt +++ b/Documentation/block/bfq-iosched.txt @@ -31,16 +31,22 @@ latency and throughput, or on how to maximize throughput. BFQ has a non-null overhead, which limits the maximum IOPS that the CPU can process for a device scheduled with BFQ. To give an idea of -the limits on slow or average CPUs, here are BFQ limits for three -different CPUs, on, respectively, an average laptop, an old desktop, -and a cheap embedded system, in case full hierarchical support is -enabled (i.e., CONFIG_BFQ_SQ_GROUP_IOSCHED is set for bfq-sq, or -CONFIG_MQ_BFQ_GROUP_IOSCHED is set for bfq-mq, or, finally, -CONFIG_BFQ_GROUP_IOSCHED is set for bfq): +the limits on slow or average CPUs, here are, first, the limits of +bfq-sq for three different CPUs, on, respectively, an average laptop, +an old desktop, and a cheap embedded system, in case full hierarchical +support is enabled (i.e., CONFIG_BFQ_SQ_GROUP_IOSCHED is set): - Intel i7-4850HQ: 250 KIOPS - AMD A8-3850: 170 KIOPS - ARM CortexTM-A53 Octa-core: 45 KIOPS +bfq-mq and bfq instances reach, instead, a higher sustainable +throughput. Their limits, on the same systems as above, are, with full +hierarchical support enabled (i.e., CONFIG_MQ_BFQ_GROUP_IOSCHED set +for bfq-mq, or CONFIG_BFQ_GROUP_IOSCHED set for bfq): +- Intel i7-4850HQ: 310 KIOPS +- AMD A8-3850: 200 KIOPS +- ARM CortexTM-A53 Octa-core: 56 KIOPS + BFQ works for multi-queue devices too (bfq and bfq-mq instances). The table of contents follows. Impatients can just jump to Section 3. diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c index fbf28804c220..ab3b83d612c2 100644 --- a/block/bfq-mq-iosched.c +++ b/block/bfq-mq-iosched.c @@ -1822,7 +1822,7 @@ static void bfq_request_merged(struct request_queue *q, struct request *req, bfqq->next_rq = next_rq; bfq_log_bfqq(bfqd, bfqq, - "requests_merged: req %p prev %p next_rq %p bfqq %p", + "request_merged: req %p prev %p next_rq %p bfqq %p", req, prev, next_rq, bfqq); /* @@ -2415,7 +2415,6 @@ static void __bfq_set_in_service_queue(struct bfq_data *bfqd, struct bfq_queue *bfqq) { if (bfqq) { - bfqg_stats_update_avg_queue_size(bfqq_group(bfqq)); bfq_clear_bfqq_fifo_expire(bfqq); bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8; @@ -3784,7 +3783,6 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) */ bfq_clear_bfqq_wait_request(bfqq); hrtimer_try_to_cancel(&bfqd->idle_slice_timer); - bfqg_stats_update_idle_time(bfqq_group(bfqq)); } goto keep_queue; } @@ -4072,16 +4070,67 @@ static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) { struct bfq_data *bfqd = hctx->queue->elevator->elevator_data; struct request *rq; +#ifdef BFQ_GROUP_IOSCHED_ENABLED + struct bfq_queue *in_serv_queue, *bfqq; + bool waiting_rq, idle_timer_disabled; +#endif spin_lock_irq(&bfqd->lock); +#ifdef BFQ_GROUP_IOSCHED_ENABLED + in_serv_queue = bfqd->in_service_queue; + waiting_rq = in_serv_queue && bfq_bfqq_wait_request(in_serv_queue); + rq = __bfq_dispatch_request(hctx); - if (rq && RQ_BFQQ(rq)) - bfqg_stats_update_io_remove(bfqq_group(RQ_BFQQ(rq)), - rq->cmd_flags); + idle_timer_disabled = + waiting_rq && !bfq_bfqq_wait_request(in_serv_queue); + +#else + rq = __bfq_dispatch_request(hctx); +#endif spin_unlock_irq(&bfqd->lock); +#ifdef BFQ_GROUP_IOSCHED_ENABLED + bfqq = rq ? RQ_BFQQ(rq) : NULL; + if (!idle_timer_disabled && !bfqq) + return rq; + + /* + * rq and bfqq are guaranteed to exist until this function + * ends, for the following reasons. First, rq can be + * dispatched to the device, and then can be completed and + * freed, only after this function ends. Second, rq cannot be + * merged (and thus freed because of a merge) any longer, + * because it has already started. Thus rq cannot be freed + * before this function ends, and, since rq has a reference to + * bfqq, the same guarantee holds for bfqq too. + * + * In addition, the following queue lock guarantees that + * bfqq_group(bfqq) exists as well. + */ + spin_lock_irq(hctx->queue->queue_lock); + if (idle_timer_disabled) + /* + * Since the idle timer has been disabled, + * in_serv_queue contained some request when + * __bfq_dispatch_request was invoked above, which + * implies that rq was picked exactly from + * in_serv_queue. Thus in_serv_queue == bfqq, and is + * therefore guaranteed to exist because of the above + * arguments. + */ + bfqg_stats_update_idle_time(bfqq_group(in_serv_queue)); + if (bfqq) { + struct bfq_group *bfqg = bfqq_group(bfqq); + + bfqg_stats_update_avg_queue_size(bfqg); + bfqg_stats_set_start_empty_time(bfqg); + bfqg_stats_update_io_remove(bfqg, rq->cmd_flags); + } + spin_unlock_irq(hctx->queue->queue_lock); +#endif + return rq; } @@ -4200,7 +4249,6 @@ static void bfq_exit_icq_bfqq(struct bfq_io_cq *bic, bool is_sync) unsigned long flags; spin_lock_irqsave(&bfqd->lock, flags); - bfq_exit_bfqq(bfqd, bfqq); bic_set_bfqq(bic, NULL, is_sync); spin_unlock_irqrestore(&bfqd->lock, flags); @@ -4554,7 +4602,6 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, */ bfq_clear_bfqq_wait_request(bfqq); hrtimer_try_to_cancel(&bfqd->idle_slice_timer); - bfqg_stats_update_idle_time(bfqq_group(bfqq)); /* * The queue is not empty, because a new request just @@ -4569,9 +4616,11 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, } } -static void __bfq_insert_request(struct bfq_data *bfqd, struct request *rq) +/* returns true if it causes the idle timer to be disabled */ +static bool __bfq_insert_request(struct bfq_data *bfqd, struct request *rq) { struct bfq_queue *bfqq = RQ_BFQQ(rq), *new_bfqq; + bool waiting, idle_timer_disabled = false; BUG_ON(!bfqq); assert_spin_locked(&bfqd->lock); @@ -4624,12 +4673,16 @@ static void __bfq_insert_request(struct bfq_data *bfqd, struct request *rq) } } + waiting = bfqq && bfq_bfqq_wait_request(bfqq); bfq_add_request(rq); + idle_timer_disabled = waiting && !bfq_bfqq_wait_request(bfqq); rq->fifo_time = ktime_get_ns() + bfqd->bfq_fifo_expire[rq_is_sync(rq)]; list_add_tail(&rq->queuelist, &bfqq->fifo); bfq_rq_enqueued(bfqd, bfqq, rq); + + return idle_timer_disabled; } static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, @@ -4638,6 +4691,10 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, struct request_queue *q = hctx->queue; struct bfq_data *bfqd = q->elevator->elevator_data; struct bfq_queue *bfqq = RQ_BFQQ(rq); +#ifdef BFQ_GROUP_IOSCHED_ENABLED + bool idle_timer_disabled = false; + unsigned int cmd_flags; +#endif spin_lock_irq(&bfqd->lock); if (blk_mq_sched_try_insert_merge(q, rq)) { @@ -4669,13 +4726,17 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, BUG_ON(!(rq->rq_flags & RQF_GOT)); rq->rq_flags &= ~RQF_GOT; - __bfq_insert_request(bfqd, rq); +#ifdef BFQ_GROUP_IOSCHED_ENABLED + idle_timer_disabled = __bfq_insert_request(bfqd, rq); /* * Update bfqq, because, if a queue merge has occurred * in __bfq_insert_request, then rq has been * redirected into a new queue. */ bfqq = RQ_BFQQ(rq); +#else + __bfq_insert_request(bfqd, rq); +#endif if (rq_mergeable(rq)) { elv_rqhash_add(q, rq); @@ -4683,11 +4744,34 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, q->last_merge = rq; } } - - if (bfqq) - bfqg_stats_update_io_add(bfqq_group(bfqq), bfqq, rq->cmd_flags); - +#ifdef BFQ_GROUP_IOSCHED_ENABLED + /* + * Cache cmd_flags before releasing scheduler lock, because rq + * may disappear afterwards (for example, because of a request + * merge). + */ + cmd_flags = rq->cmd_flags; +#endif spin_unlock_irq(&bfqd->lock); +#ifdef BFQ_GROUP_IOSCHED_ENABLED + if (!bfqq) + return; + /* + * bfqq still exists, because it can disappear only after + * either it is merged with another queue, or the process it + * is associated with exits. But both actions must be taken by + * the same process currently executing this flow of + * instruction. + * + * In addition, the following queue lock guarantees that + * bfqq_group(bfqq) exists as well. + */ + spin_lock_irq(q->queue_lock); + bfqg_stats_update_io_add(bfqq_group(bfqq), bfqq, cmd_flags); + if (idle_timer_disabled) + bfqg_stats_update_idle_time(bfqq_group(bfqq)); + spin_unlock_irq(q->queue_lock); +#endif } static void bfq_insert_requests(struct blk_mq_hw_ctx *hctx, diff --git a/block/bfq-sched.c b/block/bfq-sched.c index e4a2553a2d2c..616c0692335a 100644 --- a/block/bfq-sched.c +++ b/block/bfq-sched.c @@ -949,9 +949,11 @@ static void bfq_bfqq_served(struct bfq_queue *bfqq, int served) st->vtime += bfq_delta(served, st->wsum); bfq_forget_idle(st); } +#ifndef BFQ_MQ #ifdef BFQ_GROUP_IOSCHED_ENABLED bfqg_stats_set_start_empty_time(bfqq_group(bfqq)); #endif +#endif st = bfq_entity_service_tree(&bfqq->entity); bfq_log_bfqq(bfqq->bfqd, bfqq, "bfqq_served %d secs, vtime %llu on %p", served, ((st->vtime>>10)*1000)>>12, st); From abdfb33a3325df55ec0261fd824ca61ddac13575 Mon Sep 17 00:00:00 2001 From: Luca Miccio Date: Wed, 8 Nov 2017 19:07:41 +0100 Subject: [PATCH 51/51] block, bfq-sq, bfq-mq: move debug blkio stats behind CONFIG_DEBUG_BLK_CGROUP BFQ (both bfq-mq and bfq-sq) currently creates, and updates, its own instance of the whole set of blkio statistics that cfq creates. Yet, from the comments of Tejun Heo in [1], it turned out that most of these statistics are meant/useful only for debugging. This commit makes BFQ create the latter, debugging statistics only if the option CONFIG_DEBUG_BLK_CGROUP is set. By doing so, this commit also enables BFQ to enjoy a high perfomance boost. The reason is that, if CONFIG_DEBUG_BLK_CGROUP is not set, then BFQ has to update far fewer statistics, and, in particular, not the heaviest to update. To give an idea of the benefits, if CONFIG_DEBUG_BLK_CGROUP is not set, then, on an Intel i7-4850HQ, and with 8 threads doing random I/O in parallel on null_blk (configured with 0 latency), the throughput of bfq-mq grows from 310 to 400 KIOPS (+30%). We have measured similar or even much higher boosts with other CPUs: e.g., +45% with an ARM CortexTM-A53 Octa-core. Our results have been obtained and can be reproduced very easily with the script in [1]. [1] https://www.spinics.net/lists/linux-block/msg18943.html Reported-by: Tejun Heo Signed-off-by: Luca Miccio Signed-off-by: Paolo Valente --- Documentation/block/bfq-iosched.txt | 59 ++++++++++--- block/bfq-cgroup-included.c | 163 ++++++++++++++++++++---------------- block/bfq-mq-iosched.c | 14 ++-- block/bfq-mq.h | 4 +- block/bfq.h | 4 +- 5 files changed, 147 insertions(+), 97 deletions(-) diff --git a/Documentation/block/bfq-iosched.txt b/Documentation/block/bfq-iosched.txt index c816c595082d..30ef2dba85ad 100644 --- a/Documentation/block/bfq-iosched.txt +++ b/Documentation/block/bfq-iosched.txt @@ -29,24 +29,41 @@ for that device, by setting low_latency to 0. See Section 3 for details on how to configure BFQ for the desired tradeoff between latency and throughput, or on how to maximize throughput. -BFQ has a non-null overhead, which limits the maximum IOPS that the -CPU can process for a device scheduled with BFQ. To give an idea of -the limits on slow or average CPUs, here are, first, the limits of -bfq-sq for three different CPUs, on, respectively, an average laptop, +BFQ has a non-null overhead, which limits the maximum IOPS that a CPU +can process for a device scheduled with BFQ. To give an idea of the +limits on slow or average CPUs, here are, first, the limits of bfq-mq +and bfq for three different CPUs, on, respectively, an average laptop, an old desktop, and a cheap embedded system, in case full hierarchical -support is enabled (i.e., CONFIG_BFQ_SQ_GROUP_IOSCHED is set): -- Intel i7-4850HQ: 250 KIOPS -- AMD A8-3850: 170 KIOPS -- ARM CortexTM-A53 Octa-core: 45 KIOPS - -bfq-mq and bfq instances reach, instead, a higher sustainable -throughput. Their limits, on the same systems as above, are, with full -hierarchical support enabled (i.e., CONFIG_MQ_BFQ_GROUP_IOSCHED set -for bfq-mq, or CONFIG_BFQ_GROUP_IOSCHED set for bfq): +support is enabled (i.e., CONFIG_MQ_BFQ_GROUP_IOSCHED is set for +bfq-mq, or CONFIG_BFQ_GROUP_IOSCHED is set for bfq), but +CONFIG_DEBUG_BLK_CGROUP is not set (Section 4-2): +- Intel i7-4850HQ: 400 KIOPS +- AMD A8-3850: 250 KIOPS +- ARM CortexTM-A53 Octa-core: 80 KIOPS + +As for bfq-sq, it cannot reach the above IOPS, because of the +inherent, lower parallelism of legacy blk and of the components within +it (including bfq-sq itself). In particular, results with +CONFIG_DEBUG_BLK_CGROUP unset are rather fluctuating. The limits +reported below for the case CONFIG_DEBUG_BLK_CGROUP set will however +provide a lower bound to the limits of bfq-sq. + +Turning back to bfq-mq and bfq, If CONFIG_DEBUG_BLK_CGROUP is set (and +of course full hierarchical support is enabled), then the sustainable +throughput with bfq-mq and bfq decreases, because all blkio.bfq* +statistics are created and updated (Section 4-2). For bfq-mq and bfq, +this leads to the following maximum sustainable throughputs, on the +same systems as above: - Intel i7-4850HQ: 310 KIOPS - AMD A8-3850: 200 KIOPS - ARM CortexTM-A53 Octa-core: 56 KIOPS +Finally, if CONFIG_DEBUG_BLK_CGROUP is set (and full hierarchical +support is enabled), then bfq-sq exhibits the following limits: +- Intel i7-4850HQ: 250 KIOPS +- AMD A8-3850: 170 KIOPS +- ARM CortexTM-A53 Octa-core: 45 KIOPS + BFQ works for multi-queue devices too (bfq and bfq-mq instances). The table of contents follows. Impatients can just jump to Section 3. @@ -524,6 +541,22 @@ BFQ-specific files is "blkio.bfqX." or "io.bfqX.", where X can be "" to set the weight of a group with the mainline BFQ is blkio.bfq.weight or io.bfq.weight. +As for cgroups-v1 (blkio controller), the exact set of stat files +created, and kept up-to-date by bfq*, depends on whether +CONFIG_DEBUG_BLK_CGROUP is set. If it is set, then bfq* creates all +the stat files documented in +Documentation/cgroup-v1/blkio-controller.txt. If, instead, +CONFIG_DEBUG_BLK_CGROUP is not set, then bfq* creates only the files +blkio.bfq*.io_service_bytes +blkio.bfq*.io_service_bytes_recursive +blkio.bfq*.io_serviced +blkio.bfq*.io_serviced_recursive + +The value of CONFIG_DEBUG_BLK_CGROUP greatly influences the maximum +throughput sustainable with bfq*, because updating the blkio.bfq* +stats is rather costly, especially for some of the stats enabled by +CONFIG_DEBUG_BLK_CGROUP. + Parameters to set ----------------- diff --git a/block/bfq-cgroup-included.c b/block/bfq-cgroup-included.c index 631e53d9150d..562b0ce581a7 100644 --- a/block/bfq-cgroup-included.c +++ b/block/bfq-cgroup-included.c @@ -15,7 +15,7 @@ * file. */ -#ifdef BFQ_GROUP_IOSCHED_ENABLED +#if defined(BFQ_GROUP_IOSCHED_ENABLED) && defined(CONFIG_DEBUG_BLK_CGROUP) /* bfqg stats flags */ enum bfqg_stats_flags { @@ -155,6 +155,63 @@ static void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg) bfqg_stats_update_group_wait_time(stats); } +static void bfqg_stats_update_io_add(struct bfq_group *bfqg, + struct bfq_queue *bfqq, unsigned int op) +{ + blkg_rwstat_add(&bfqg->stats.queued, op, 1); + bfqg_stats_end_empty_time(&bfqg->stats); + if (!(bfqq == ((struct bfq_data *)bfqg->bfqd)->in_service_queue)) + bfqg_stats_set_start_group_wait_time(bfqg, bfqq_group(bfqq)); +} + +static void bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op) +{ + blkg_rwstat_add(&bfqg->stats.queued, op, -1); +} + +static void bfqg_stats_update_io_merged(struct bfq_group *bfqg, unsigned int op) +{ + blkg_rwstat_add(&bfqg->stats.merged, op, 1); +} + +static void bfqg_stats_update_completion(struct bfq_group *bfqg, + uint64_t start_time, uint64_t io_start_time, unsigned int op) +{ + struct bfqg_stats *stats = &bfqg->stats; + unsigned long long now = sched_clock(); + + if (time_after64(now, io_start_time)) + blkg_rwstat_add(&stats->service_time, op, + now - io_start_time); + if (time_after64(io_start_time, start_time)) + blkg_rwstat_add(&stats->wait_time, op, + io_start_time - start_time); +} + +#else /* BFQ_GROUP_IOSCHED_ENABLED && CONFIG_DEBUG_BLK_CGROUP */ + +static inline void bfqg_stats_update_io_add(struct bfq_group *bfqg, + struct bfq_queue *bfqq, unsigned int op) { } +static inline void +bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op) { } +static inline void +bfqg_stats_update_io_merged(struct bfq_group *bfqg, unsigned int op) { } +static inline void bfqg_stats_update_completion(struct bfq_group *bfqg, + uint64_t start_time, uint64_t io_start_time, + unsigned int op) { } +static inline void +bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg, + struct bfq_group *curr_bfqg) { } +static inline void bfqg_stats_end_empty_time(struct bfqg_stats *stats) { } +static inline void bfqg_stats_update_dequeue(struct bfq_group *bfqg) { } +static inline void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg) { } +static inline void bfqg_stats_update_idle_time(struct bfq_group *bfqg) { } +static inline void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg) { } +static inline void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg) { } + +#endif /* BFQ_GROUP_IOSCHED_ENABLED && CONFIG_DEBUG_BLK_CGROUP */ + +#ifdef BFQ_GROUP_IOSCHED_ENABLED static struct blkcg_policy blkcg_policy_bfq; /* @@ -247,44 +304,10 @@ static void bfqg_and_blkg_put(struct bfq_group *bfqg) } #endif -static void bfqg_stats_update_io_add(struct bfq_group *bfqg, - struct bfq_queue *bfqq, - unsigned int op) -{ - blkg_rwstat_add(&bfqg->stats.queued, op, 1); - bfqg_stats_end_empty_time(&bfqg->stats); - if (!(bfqq == ((struct bfq_data *)bfqg->bfqd)->in_service_queue)) - bfqg_stats_set_start_group_wait_time(bfqg, bfqq_group(bfqq)); -} - -static void bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op) -{ - blkg_rwstat_add(&bfqg->stats.queued, op, -1); -} - -static void bfqg_stats_update_io_merged(struct bfq_group *bfqg, unsigned int op) -{ - blkg_rwstat_add(&bfqg->stats.merged, op, 1); -} - -static void bfqg_stats_update_completion(struct bfq_group *bfqg, - uint64_t start_time, uint64_t io_start_time, - unsigned int op) -{ - struct bfqg_stats *stats = &bfqg->stats; - unsigned long long now = sched_clock(); - - if (time_after64(now, io_start_time)) - blkg_rwstat_add(&stats->service_time, op, - now - io_start_time); - if (time_after64(io_start_time, start_time)) - blkg_rwstat_add(&stats->wait_time, op, - io_start_time - start_time); -} - /* @stats = 0 */ static void bfqg_stats_reset(struct bfqg_stats *stats) { +#ifdef CONFIG_DEBUG_BLK_CGROUP /* queued stats shouldn't be cleared */ blkg_rwstat_reset(&stats->merged); blkg_rwstat_reset(&stats->service_time); @@ -296,6 +319,7 @@ static void bfqg_stats_reset(struct bfqg_stats *stats) blkg_stat_reset(&stats->group_wait_time); blkg_stat_reset(&stats->idle_time); blkg_stat_reset(&stats->empty_time); +#endif } /* @to += @from */ @@ -304,6 +328,7 @@ static void bfqg_stats_add_aux(struct bfqg_stats *to, struct bfqg_stats *from) if (!to || !from) return; +#ifdef CONFIG_DEBUG_BLK_CGROUP /* queued stats shouldn't be cleared */ blkg_rwstat_add_aux(&to->merged, &from->merged); blkg_rwstat_add_aux(&to->service_time, &from->service_time); @@ -316,6 +341,7 @@ static void bfqg_stats_add_aux(struct bfqg_stats *to, struct bfqg_stats *from) blkg_stat_add_aux(&to->group_wait_time, &from->group_wait_time); blkg_stat_add_aux(&to->idle_time, &from->idle_time); blkg_stat_add_aux(&to->empty_time, &from->empty_time); +#endif } /* @@ -367,6 +393,7 @@ static void bfq_init_entity(struct bfq_entity *entity, static void bfqg_stats_exit(struct bfqg_stats *stats) { +#ifdef CONFIG_DEBUG_BLK_CGROUP blkg_rwstat_exit(&stats->merged); blkg_rwstat_exit(&stats->service_time); blkg_rwstat_exit(&stats->wait_time); @@ -378,10 +405,12 @@ static void bfqg_stats_exit(struct bfqg_stats *stats) blkg_stat_exit(&stats->group_wait_time); blkg_stat_exit(&stats->idle_time); blkg_stat_exit(&stats->empty_time); +#endif } static int bfqg_stats_init(struct bfqg_stats *stats, gfp_t gfp) { +#ifdef CONFIG_DEBUG_BLK_CGROUP if (blkg_rwstat_init(&stats->merged, gfp) || blkg_rwstat_init(&stats->service_time, gfp) || blkg_rwstat_init(&stats->wait_time, gfp) || @@ -396,6 +425,7 @@ static int bfqg_stats_init(struct bfqg_stats *stats, gfp_t gfp) bfqg_stats_exit(stats); return -ENOMEM; } +#endif return 0; } @@ -1003,6 +1033,7 @@ static ssize_t bfq_io_set_weight(struct kernfs_open_file *of, return bfq_io_set_weight_legacy(of_css(of), NULL, weight); } +#ifdef CONFIG_DEBUG_BLK_CGROUP static int bfqg_print_stat(struct seq_file *sf, void *v) { blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_stat, @@ -1108,6 +1139,7 @@ static int bfqg_print_avg_queue_size(struct seq_file *sf, void *v) 0, false); return 0; } +#endif /* CONFIG_DEBUG_BLK_CGROUP */ static struct bfq_group * bfq_create_group_hierarchy(struct bfq_data *bfqd, int node) @@ -1137,15 +1169,6 @@ static struct cftype bfq_blkcg_legacy_files[] = { /* statistics, covers only the tasks in the bfqg */ { - .name = BFQ_CGROUP_FNAME(time), - .private = offsetof(struct bfq_group, stats.time), - .seq_show = bfqg_print_stat, - }, - { - .name = BFQ_CGROUP_FNAME(sectors), - .seq_show = bfqg_print_stat_sectors, - }, - { .name = BFQ_CGROUP_FNAME(io_service_bytes), .private = (unsigned long)&blkcg_policy_bfq, .seq_show = blkg_print_stat_bytes, @@ -1155,6 +1178,16 @@ static struct cftype bfq_blkcg_legacy_files[] = { .private = (unsigned long)&blkcg_policy_bfq, .seq_show = blkg_print_stat_ios, }, +#ifdef CONFIG_DEBUG_BLK_CGROUP + { + .name = BFQ_CGROUP_FNAME(time), + .private = offsetof(struct bfq_group, stats.time), + .seq_show = bfqg_print_stat, + }, + { + .name = BFQ_CGROUP_FNAME(sectors), + .seq_show = bfqg_print_stat_sectors, + }, { .name = BFQ_CGROUP_FNAME(io_service_time), .private = offsetof(struct bfq_group, stats.service_time), @@ -1175,18 +1208,10 @@ static struct cftype bfq_blkcg_legacy_files[] = { .private = offsetof(struct bfq_group, stats.queued), .seq_show = bfqg_print_rwstat, }, +#endif /* CONFIG_DEBUG_BLK_CGROUP */ /* the same statictics which cover the bfqg and its descendants */ { - .name = BFQ_CGROUP_FNAME(time_recursive), - .private = offsetof(struct bfq_group, stats.time), - .seq_show = bfqg_print_stat_recursive, - }, - { - .name = BFQ_CGROUP_FNAME(sectors_recursive), - .seq_show = bfqg_print_stat_sectors_recursive, - }, - { .name = BFQ_CGROUP_FNAME(io_service_bytes_recursive), .private = (unsigned long)&blkcg_policy_bfq, .seq_show = blkg_print_stat_bytes_recursive, @@ -1196,6 +1221,16 @@ static struct cftype bfq_blkcg_legacy_files[] = { .private = (unsigned long)&blkcg_policy_bfq, .seq_show = blkg_print_stat_ios_recursive, }, +#ifdef CONFIG_DEBUG_BLK_CGROUP + { + .name = BFQ_CGROUP_FNAME(time_recursive), + .private = offsetof(struct bfq_group, stats.time), + .seq_show = bfqg_print_stat_recursive, + }, + { + .name = BFQ_CGROUP_FNAME(sectors_recursive), + .seq_show = bfqg_print_stat_sectors_recursive, + }, { .name = BFQ_CGROUP_FNAME(io_service_time_recursive), .private = offsetof(struct bfq_group, stats.service_time), @@ -1240,6 +1275,7 @@ static struct cftype bfq_blkcg_legacy_files[] = { .private = offsetof(struct bfq_group, stats.dequeue), .seq_show = bfqg_print_stat, }, +#endif /* CONFIG_DEBUG_BLK_CGROUP */ { } /* terminate */ }; @@ -1257,25 +1293,6 @@ static struct cftype bfq_blkg_files[] = { #else /* BFQ_GROUP_IOSCHED_ENABLED */ -static inline void bfqg_stats_update_io_add(struct bfq_group *bfqg, - struct bfq_queue *bfqq, unsigned int op) { } -static inline void -bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op) { } -static inline void -bfqg_stats_update_io_merged(struct bfq_group *bfqg, unsigned int op) { } -static inline void bfqg_stats_update_completion(struct bfq_group *bfqg, - uint64_t start_time, uint64_t io_start_time, - unsigned int op) { } -static inline void -bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg, - struct bfq_group *curr_bfqg) { } -static inline void bfqg_stats_end_empty_time(struct bfqg_stats *stats) { } -static inline void bfqg_stats_update_dequeue(struct bfq_group *bfqg) { } -static inline void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg) { } -static inline void bfqg_stats_update_idle_time(struct bfq_group *bfqg) { } -static inline void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg) { } -static inline void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg) { } - static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, struct bfq_group *bfqg) {} diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c index ab3b83d612c2..0c09609a6099 100644 --- a/block/bfq-mq-iosched.c +++ b/block/bfq-mq-iosched.c @@ -4070,14 +4070,14 @@ static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) { struct bfq_data *bfqd = hctx->queue->elevator->elevator_data; struct request *rq; -#ifdef BFQ_GROUP_IOSCHED_ENABLED +#if defined(BFQ_GROUP_IOSCHED_ENABLED) && defined(CONFIG_DEBUG_BLK_CGROUP) struct bfq_queue *in_serv_queue, *bfqq; bool waiting_rq, idle_timer_disabled; #endif spin_lock_irq(&bfqd->lock); -#ifdef BFQ_GROUP_IOSCHED_ENABLED +#if defined(BFQ_GROUP_IOSCHED_ENABLED) && defined(CONFIG_DEBUG_BLK_CGROUP) in_serv_queue = bfqd->in_service_queue; waiting_rq = in_serv_queue && bfq_bfqq_wait_request(in_serv_queue); @@ -4091,7 +4091,7 @@ static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) #endif spin_unlock_irq(&bfqd->lock); -#ifdef BFQ_GROUP_IOSCHED_ENABLED +#if defined(BFQ_GROUP_IOSCHED_ENABLED) && defined(CONFIG_DEBUG_BLK_CGROUP) bfqq = rq ? RQ_BFQQ(rq) : NULL; if (!idle_timer_disabled && !bfqq) return rq; @@ -4691,7 +4691,7 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, struct request_queue *q = hctx->queue; struct bfq_data *bfqd = q->elevator->elevator_data; struct bfq_queue *bfqq = RQ_BFQQ(rq); -#ifdef BFQ_GROUP_IOSCHED_ENABLED +#if defined(BFQ_GROUP_IOSCHED_ENABLED) && defined(CONFIG_DEBUG_BLK_CGROUP) bool idle_timer_disabled = false; unsigned int cmd_flags; #endif @@ -4726,7 +4726,7 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, BUG_ON(!(rq->rq_flags & RQF_GOT)); rq->rq_flags &= ~RQF_GOT; -#ifdef BFQ_GROUP_IOSCHED_ENABLED +#if defined(BFQ_GROUP_IOSCHED_ENABLED) && defined(CONFIG_DEBUG_BLK_CGROUP) idle_timer_disabled = __bfq_insert_request(bfqd, rq); /* * Update bfqq, because, if a queue merge has occurred @@ -4744,7 +4744,7 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, q->last_merge = rq; } } -#ifdef BFQ_GROUP_IOSCHED_ENABLED +#if defined(BFQ_GROUP_IOSCHED_ENABLED) && defined(CONFIG_DEBUG_BLK_CGROUP) /* * Cache cmd_flags before releasing scheduler lock, because rq * may disappear afterwards (for example, because of a request @@ -4753,7 +4753,7 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, cmd_flags = rq->cmd_flags; #endif spin_unlock_irq(&bfqd->lock); -#ifdef BFQ_GROUP_IOSCHED_ENABLED +#if defined(BFQ_GROUP_IOSCHED_ENABLED) && defined(CONFIG_DEBUG_BLK_CGROUP) if (!bfqq) return; /* diff --git a/block/bfq-mq.h b/block/bfq-mq.h index 7ed2cc29be57..1cb05bb853d2 100644 --- a/block/bfq-mq.h +++ b/block/bfq-mq.h @@ -784,7 +784,7 @@ enum bfqq_expiration { struct bfqg_stats { -#ifdef BFQ_GROUP_IOSCHED_ENABLED +#if defined(BFQ_GROUP_IOSCHED_ENABLED) && defined(CONFIG_DEBUG_BLK_CGROUP) /* number of ios merged */ struct blkg_rwstat merged; /* total time spent on device in ns, may not be accurate w/ queueing */ @@ -812,7 +812,7 @@ struct bfqg_stats { uint64_t start_idle_time; uint64_t start_empty_time; uint16_t flags; -#endif +#endif /* BFQ_GROUP_IOSCHED_ENABLED && CONFIG_DEBUG_BLK_CGROUP */ }; #ifdef BFQ_GROUP_IOSCHED_ENABLED diff --git a/block/bfq.h b/block/bfq.h index 15d326f466b7..47cd4d5a8c32 100644 --- a/block/bfq.h +++ b/block/bfq.h @@ -791,7 +791,7 @@ enum bfqq_expiration { struct bfqg_stats { -#ifdef BFQ_GROUP_IOSCHED_ENABLED +#if defined(BFQ_GROUP_IOSCHED_ENABLED) && defined(CONFIG_DEBUG_BLK_CGROUP) /* number of ios merged */ struct blkg_rwstat merged; /* total time spent on device in ns, may not be accurate w/ queueing */ @@ -819,7 +819,7 @@ struct bfqg_stats { uint64_t start_idle_time; uint64_t start_empty_time; uint16_t flags; -#endif +#endif /* BFQ_GROUP_IOSCHED_ENABLED && CONFIG_DEBUG_BLK_CGROUP */ }; #ifdef BFQ_GROUP_IOSCHED_ENABLED